From 01389e1d88e6b43b9478b879456948ad5dc2fca1 Mon Sep 17 00:00:00 2001 From: PaperChalice Date: Thu, 26 Jun 2025 11:00:24 +0800 Subject: [PATCH 1/5] [DAGCombiner] Remove UnsafeFPMath usage in visitFSUBForFMACombine --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 10 +- llvm/test/CodeGen/AMDGPU/fma-combine.ll | 342 ++++++++++-------- .../AMDGPU/fmul-2-combine-multi-use.ll | 10 +- llvm/test/CodeGen/AMDGPU/mad-combine.ll | 190 +++++++--- llvm/test/CodeGen/PowerPC/fma-combine.ll | 41 +-- 5 files changed, 363 insertions(+), 230 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 91f696e8fe88e..9e6e81e2c0dee 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17010,8 +17010,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { return SDValue(); const SDNodeFlags Flags = N->getFlags(); - bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || HasFMAD); + bool AllowFusionGlobally = + (Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD); // If the subtraction is not contractable, do not combine. if (!AllowFusionGlobally && !N->getFlags().hasAllowContract()) @@ -17167,7 +17167,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { } - auto isReassociable = [&Options](SDNode *N) { - return Options.UnsafeFPMath || N->getFlags().hasAllowReassociation(); + auto isReassociable = [](SDNode *N) { + return N->getFlags().hasAllowReassociation(); }; auto isContractableAndReassociableFMUL = [&isContractableFMUL, @@ -17181,7 +17181,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // More folding opportunities when target permits. 
if (Aggressive && isReassociable(N)) { - bool CanFuse = Options.UnsafeFPMath || N->getFlags().hasAllowContract(); + bool CanFuse = N->getFlags().hasAllowContract(); // fold (fsub (fma x, y, (fmul u, v)), z) // -> (fma x, y (fma u, v, (fneg z))) if (CanFuse && isFusedOp(N0) && diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index a96d022b66f12..c79cf87712dc0 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -1,11 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s - -; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti 
-denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s ; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be ; beneficial even without fp32 denormals, but they do require no-infs-fp-math @@ -65,8 +63,8 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p %b = load volatile double, ptr addrspace(1) %gep.1 %c = load volatile double, ptr addrspace(1) %gep.2 - %mul = fmul double %a, %b - %fma = fadd double %mul, %c + %mul = fmul contract double %a, %b + %fma = fadd contract double %mul, %c store double %fma, ptr addrspace(1) %gep.out ret void } @@ -134,9 +132,9 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o %c = load volatile double, ptr addrspace(1) %gep.2 %d = load volatile double, ptr addrspace(1) %gep.3 - %mul = fmul double %a, %b - %fma0 = fadd double %mul, %c - %fma1 = fadd double %mul, %d + %mul = fmul contract double %a, %b + %fma0 = fadd contract double %mul, %c + %fma1 = fadd contract double %mul, %d store volatile double %fma0, ptr addrspace(1) %gep.out.0 store volatile 
double %fma1, ptr addrspace(1) %gep.out.1 ret void @@ -190,8 +188,8 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p %b = load volatile double, ptr addrspace(1) %gep.1 %c = load volatile double, ptr addrspace(1) %gep.2 - %mul = fmul double %a, %b - %fma = fadd double %c, %mul + %mul = fmul contract double %a, %b + %fma = fadd contract double %c, %mul store double %fma, ptr addrspace(1) %gep.out ret void } @@ -244,8 +242,8 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o %b = load volatile double, ptr addrspace(1) %gep.1 %c = load volatile double, ptr addrspace(1) %gep.2 - %mul = fmul double %a, %b - %fma = fsub double %mul, %c + %mul = fmul contract double %a, %b + %fma = fsub contract double %mul, %c store double %fma, ptr addrspace(1) %gep.out ret void } @@ -313,9 +311,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali %c = load volatile double, ptr addrspace(1) %gep.2 %d = load volatile double, ptr addrspace(1) %gep.3 - %mul = fmul double %a, %b - %fma0 = fsub double %mul, %c - %fma1 = fsub double %mul, %d + %mul = fmul contract double %a, %b + %fma0 = fsub contract double %mul, %c + %fma1 = fsub contract double %mul, %d store volatile double %fma0, ptr addrspace(1) %gep.out.0 store volatile double %fma1, ptr addrspace(1) %gep.out.1 ret void @@ -369,8 +367,8 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o %b = load volatile double, ptr addrspace(1) %gep.1 %c = load volatile double, ptr addrspace(1) %gep.2 - %mul = fmul double %a, %b - %fma = fsub double %c, %mul + %mul = fmul contract double %a, %b + %fma = fsub contract double %c, %mul store double %fma, ptr addrspace(1) %gep.out ret void } @@ -438,9 +436,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali %c = load volatile double, ptr addrspace(1) %gep.2 %d = load volatile double, ptr addrspace(1) %gep.3 - %mul = fmul double %a, %b - 
%fma0 = fsub double %c, %mul - %fma1 = fsub double %d, %mul + %mul = fmul contract double %a, %b + %fma0 = fsub contract double %c, %mul + %fma1 = fsub contract double %d, %mul store volatile double %fma0, ptr addrspace(1) %gep.out.0 store volatile double %fma1, ptr addrspace(1) %gep.out.1 ret void @@ -494,9 +492,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o %b = load volatile double, ptr addrspace(1) %gep.1 %c = load volatile double, ptr addrspace(1) %gep.2 - %mul = fmul double %a, %b - %mul.neg = fsub double -0.0, %mul - %fma = fsub double %mul.neg, %c + %mul = fmul contract double %a, %b + %mul.neg = fsub contract double -0.0, %mul + %fma = fsub contract double %mul.neg, %c store double %fma, ptr addrspace(1) %gep.out ret void @@ -565,10 +563,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) %c = load volatile double, ptr addrspace(1) %gep.2 %d = load volatile double, ptr addrspace(1) %gep.3 - %mul = fmul double %a, %b - %mul.neg = fsub double -0.0, %mul - %fma0 = fsub double %mul.neg, %c - %fma1 = fsub double %mul.neg, %d + %mul = fmul contract double %a, %b + %mul.neg = fsub contract double -0.0, %mul + %fma0 = fsub contract double %mul.neg, %c + %fma1 = fsub contract double %mul.neg, %d store volatile double %fma0, ptr addrspace(1) %gep.out.0 store volatile double %fma1, ptr addrspace(1) %gep.out.1 @@ -638,10 +636,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) %c = load volatile double, ptr addrspace(1) %gep.2 %d = load volatile double, ptr addrspace(1) %gep.3 - %mul = fmul double %a, %b - %mul.neg = fsub double -0.0, %mul - %fma0 = fsub double %mul.neg, %c - %fma1 = fsub double %mul, %d + %mul = fmul contract double %a, %b + %mul.neg = fsub contract double -0.0, %mul + %fma0 = fsub contract double %mul.neg, %c + %fma1 = fsub contract double %mul, %d store volatile double %fma0, ptr addrspace(1) %gep.out.0 store volatile double %fma1, ptr 
addrspace(1) %gep.out.1 @@ -650,32 +648,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) ; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { -; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: -; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s6, 0 -; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NOFMA-NEXT: v_mov_b32_e32 v1, 0 -; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NOFMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11] -; SI-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9] -; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] -; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; SI-NOFMA-NEXT: s_endpgm -; ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; SI-FMA: ; %bb.0: ; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -701,30 +673,6 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; SI-FMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-FMA-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: -; 
GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9] -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7] -; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] -; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm -; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -761,18 +709,16 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) %u = load volatile double, ptr addrspace(1) %gep.3 %v = load volatile double, ptr addrspace(1) %gep.4 - %tmp0 = fmul double %u, %v - %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0 - %tmp2 = fsub double %tmp1, %z + %tmp0 = fmul contract fast double %u, %v + %tmp1 = call contract fast double @llvm.fma.f64(double %x, double %y, double %tmp0) #0 + %tmp2 = fsub contract fast double %tmp1, %z store double %tmp2, ptr addrspace(1) %gep.out ret void } -; fold (fsub x, (fma y, z, (fmul u, v))) -; -> (fma (fneg y), z, (fma (fneg u), v, x)) -define 
amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { -; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: +define amdgpu_kernel void @no_aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +; SI-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_0_f64: ; SI-NOFMA: ; %bb.0: ; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 @@ -793,11 +739,59 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11] -; SI-NOFMA-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] -; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5] +; SI-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] ; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NOFMA-NEXT: s_endpgm ; +; GFX11-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_0_f64: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9] +; 
GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7] +; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] +; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-NOFMA-NEXT: s_endpgm + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 + %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2 + %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3 + %gep.4 = getelementptr double, ptr addrspace(1) %gep.0, i32 4 + %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid + + %x = load volatile double, ptr addrspace(1) %gep.0 + %y = load volatile double, ptr addrspace(1) %gep.1 + %z = load volatile double, ptr addrspace(1) %gep.2 + %u = load volatile double, ptr addrspace(1) %gep.3 + %v = load volatile double, ptr addrspace(1) %gep.4 + + %tmp0 = fmul double %u, %v + %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0 + %tmp2 = fsub double %tmp1, %z + + store double %tmp2, ptr addrspace(1) %gep.out + ret void +} + +; fold (fsub x, (fma y, z, (fmul u, v))) +; -> (fma (fneg y), z, (fma (fneg u), v, x)) +define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; SI-FMA: ; %bb.0: ; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -823,30 +817,6 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; SI-FMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-FMA-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9] -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] -; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm -; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -883,6 +853,78 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) %u = load volatile double, ptr addrspace(1) %gep.3 %v = load volatile double, ptr addrspace(1) %gep.4 + ; nsz flag is needed since this combine may change sign of zero + %tmp0 = fmul contract reassoc nsz double %u, %v + %tmp1 = call contract reassoc nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0 + %tmp2 = fsub contract reassoc nsz double %x, %tmp1 + + store double %tmp2, ptr addrspace(1) %gep.out + ret void +} +define amdgpu_kernel void @no_aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +; SI-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_1_f64: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOFMA-NEXT: 
s_mov_b32 s6, 0 +; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NOFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NOFMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11] +; SI-NOFMA-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5] +; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NOFMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_1_f64: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9] +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-NOFMA-NEXT: s_endpgm + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 + %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2 + %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3 + %gep.4 = getelementptr double, ptr addrspace(1) %gep.0, i32 4 + %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid + + %x = load volatile double, ptr addrspace(1) %gep.0 + %y = load volatile double, ptr addrspace(1) %gep.1 + %z = load volatile double, ptr addrspace(1) %gep.2 + %u = load volatile double, ptr addrspace(1) %gep.3 + %v = load volatile double, ptr addrspace(1) %gep.4 + ; nsz flag is needed since this combine may change sign of zero %tmp0 = fmul nsz double %u, %v %tmp1 = call nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0 @@ -979,8 +1021,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, ptr addrspace(1) %in2) { %x = load volatile float, ptr addrspace(1) %in1 %y = load volatile float, ptr addrspace(1) %in2 - %a = fadd float %x, 1.0 - %m = fmul float %a, %y + %a = fadd contract float %x, 1.0 + %m = fmul contract float %a, %y store float %m, ptr addrspace(1) %out ret void } @@ -1068,8 +1110,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, ptr addrspace(1) %in2) { %x = load volatile float, ptr addrspace(1) %in1 %y = load volatile float, ptr addrspace(1) %in2 - %a = fadd float %x, 1.0 - %m = fmul float %y, %a + %a = fadd contract float %x, 1.0 + %m = fmul contract float %y, %a store float %m, ptr addrspace(1) %out ret 
void } @@ -1157,8 +1199,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 - %a = fadd float %x, -1.0 - %m = fmul float %a, %y + %a = fadd contract float %x, -1.0 + %m = fmul contract float %a, %y store float %m, ptr addrspace(1) %out ret void } @@ -1246,8 +1288,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 - %a = fadd float %x, -1.0 - %m = fmul float %y, %a + %a = fadd contract float %x, -1.0 + %m = fmul contract float %y, %a store float %m, ptr addrspace(1) %out ret void } @@ -1335,8 +1377,8 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 - %s = fsub float 1.0, %x - %m = fmul float %s, %y + %s = fsub contract float 1.0, %x + %m = fmul contract float %s, %y store float %m, ptr addrspace(1) %out ret void } @@ -1424,8 +1466,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 - %s = fsub float 1.0, %x - %m = fmul float %y, %s + %s = fsub contract float 1.0, %x + %m = fmul contract float %y, %s store float %m, ptr addrspace(1) %out ret void } @@ -1513,8 +1555,8 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 - %s = fsub float -1.0, %x - %m = fmul float %s, %y + %s = fsub contract float -1.0, %x + %m = fmul contract float %s, %y store float %m, ptr addrspace(1) %out ret void } @@ -1602,8 +1644,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, ptr addrspace(1) %in2) { %x = 
load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 - %s = fsub float -1.0, %x - %m = fmul float %y, %s + %s = fsub contract float -1.0, %x + %m = fmul contract float %y, %s store float %m, ptr addrspace(1) %out ret void } @@ -1691,8 +1733,8 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 - %s = fsub float %x, 1.0 - %m = fmul float %s, %y + %s = fsub contract float %x, 1.0 + %m = fmul contract float %s, %y store float %m, ptr addrspace(1) %out ret void } @@ -1780,8 +1822,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 - %s = fsub float %x, 1.0 - %m = fmul float %y, %s + %s = fsub contract float %x, 1.0 + %m = fmul contract float %y, %s store float %m, ptr addrspace(1) %out ret void } @@ -1869,8 +1911,8 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 - %s = fsub float %x, -1.0 - %m = fmul float %s, %y + %s = fsub contract float %x, -1.0 + %m = fmul contract float %s, %y store float %m, ptr addrspace(1) %out ret void } @@ -1958,8 +2000,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 - %s = fsub float %x, -1.0 - %m = fmul float %y, %s + %s = fsub contract float %x, -1.0 + %m = fmul contract float %y, %s store float %m, ptr addrspace(1) %out ret void } @@ -2072,10 +2114,10 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, %x = load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 %t = load float, ptr addrspace(1) %in3 - %t1 = fsub float 1.0, %t - %tx = fmul float %x, %t - %ty = 
fmul float %y, %t1 - %r = fadd float %tx, %ty + %t1 = fsub contract float 1.0, %t + %tx = fmul contract float %x, %t + %ty = fmul contract float %y, %t1 + %r = fadd contract float %tx, %ty store float %r, ptr addrspace(1) %out ret void } @@ -2152,10 +2194,10 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, %x = load double, ptr addrspace(1) %in1 %y = load double, ptr addrspace(1) %in2 %t = load double, ptr addrspace(1) %in3 - %t1 = fsub double 1.0, %t - %tx = fmul double %x, %t - %ty = fmul double %y, %t1 - %r = fadd double %tx, %ty + %t1 = fsub contract double 1.0, %t + %tx = fmul contract double %x, %t + %ty = fmul contract double %y, %t1 + %r = fadd contract double %tx, %ty store double %r, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 7c0d3692242a4..d4471c85c467c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -73,8 +73,8 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo %a16 = select i1 %a15, float %a12, float %a14 %a17 = fmul float %a16, 2.0 %a18 = fmul float %a17, %a17 - %a19 = fmul float %a18, %a17 - %a20 = fsub float 1.0, %a19 + %a19 = fmul contract float %a18, %a17 + %a20 = fsub contract float 1.0, %a19 store float %a20, ptr addrspace(1) %out ret void } @@ -540,8 +540,8 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 %a16 = select i1 %a15, half %a12, half %a14 %a17 = fmul half %a16, 2.0 %a18 = fmul half %a17, %a17 - %a19 = fmul half %a18, %a17 - %a20 = fsub half 1.0, %a19 + %a19 = fmul contract half %a18, %a17 + %a20 = fsub contract half 1.0, %a19 store half %a20, ptr addrspace(1) %out ret void } @@ -1166,7 +1166,7 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ret void } -attributes #0 = { nounwind "unsafe-fp-math"="true" } +attributes #0 
= { nounwind } attributes #1 = { nounwind readnone } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX11-DENORM: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll index e94aa4b8ce3d1..2ac181b06a350 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll @@ -1,14 +1,12 @@ ; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma. ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s - -; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s ; Make sure we don't form mad with denormals -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM 
-check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare float @llvm.fabs.f32(float) #0 @@ -25,15 +23,41 @@ declare float @llvm.fmuladd.f32(float, float, float) #0 ; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-DENORM-FASTFMAF: buffer_store_dword [[RESULT]] +; SI-STD: buffer_store_dword [[C]] +define amdgpu_kernel void @combine_to_mad_f32_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 + %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid + + %a = load volatile float, ptr addrspace(1) %gep.0 + %b = load volatile float, ptr addrspace(1) %gep.1 + %c = load volatile float, ptr addrspace(1) %gep.2 + + + %mul = fmul contract float %a, %b + %fma = fadd contract float %mul, %c + store float %fma, ptr addrspace(1) %gep.out + ret void +} +; FUNC-LABEL: {{^}}no_combine_to_mad_f32_0: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, 
s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} + +; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]] + ; SI-DENORM-SLOWFMAF-NOT: v_fma ; SI-DENORM-SLOWFMAF-NOT: v_mad ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]] ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] -; SI-DENORM: buffer_store_dword [[RESULT]] +; SI-DENORM-SLOWFMAF: buffer_store_dword [[RESULT]] ; SI-STD: buffer_store_dword [[C]] -define amdgpu_kernel void @combine_to_mad_f32_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +define amdgpu_kernel void @no_combine_to_mad_f32_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -72,7 +96,46 @@ define amdgpu_kernel void @combine_to_mad_f32_0(ptr addrspace(1) noalias %out, p ; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm -define amdgpu_kernel void @combine_to_mad_f32_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_f32_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_fast) #1 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 + %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3 + %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid + %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1 + + %a = 
load volatile float, ptr addrspace(1) %gep.0 + %b = load volatile float, ptr addrspace(1) %gep.1 + %c = load volatile float, ptr addrspace(1) %gep.2 + %d = load volatile float, ptr addrspace(1) %gep.3 + + %mul = fmul contract fast float %a, %b + %fma0 = fadd contract fast float %mul, %c + %fma1 = fadd contract fast float %mul, %d + store volatile float %fma0, ptr addrspace(1) %gep.out.0 + store volatile float %fma1, ptr addrspace(1) %gep.out.1 + ret void +} +; FUNC-LABEL: {{^}}no_combine_to_mad_f32_0_2use: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}} + +; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]] +; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]] +; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]] +; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]] + +; SI-DENORM-SLOWFMAF-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DENORM-SLOWFMAF-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define amdgpu_kernel void @no_combine_to_mad_f32_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_fast) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr 
float, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -89,7 +152,6 @@ define amdgpu_kernel void @combine_to_mad_f32_0_2use(ptr addrspace(1) noalias %o %mul = fmul float %a, %b %fma0 = fadd float %mul, %c %fma1 = fadd float %mul, %d - store volatile float %fma0, ptr addrspace(1) %gep.out.0 store volatile float %fma1, ptr addrspace(1) %gep.out.1 ret void @@ -120,8 +182,8 @@ define amdgpu_kernel void @combine_to_mad_f32_1(ptr addrspace(1) noalias %out, p %b = load volatile float, ptr addrspace(1) %gep.1 %c = load volatile float, ptr addrspace(1) %gep.2 - %mul = fmul float %a, %b - %fma = fadd float %c, %mul + %mul = fmul contract float %a, %b + %fma = fadd contract float %c, %mul store float %fma, ptr addrspace(1) %gep.out ret void } @@ -150,8 +212,8 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32(ptr addrspace(1) noalias %o %b = load volatile float, ptr addrspace(1) %gep.1 %c = load volatile float, ptr addrspace(1) %gep.2 - %mul = fmul float %a, %b - %fma = fsub float %mul, %c + %mul = fmul contract float %a, %b + %fma = fsub contract float %mul, %c store float %fma, ptr addrspace(1) %gep.out ret void } @@ -190,9 +252,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(ptr addrspace(1) noali %c = load volatile float, ptr addrspace(1) %gep.2 %d = load volatile float, ptr addrspace(1) %gep.3 - %mul = fmul float %a, %b - %fma0 = fsub float %mul, %c - %fma1 = fsub float %mul, %d + %mul = fmul contract float %a, %b + %fma0 = fsub contract float %mul, %c + %fma1 = fsub contract float %mul, %d store volatile float %fma0, ptr addrspace(1) %gep.out.0 store volatile float %fma1, ptr addrspace(1) %gep.out.1 ret void @@ -222,8 +284,8 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32(ptr addrspace(1) noalias %o %b = load volatile float, ptr addrspace(1) %gep.1 %c = load volatile float, ptr addrspace(1) %gep.2 - %mul = fmul float %a, %b - %fma = fsub float %c, %mul + %mul = fmul contract float %a, %b + %fma = 
fsub contract float %c, %mul store float %fma, ptr addrspace(1) %gep.out ret void } @@ -262,9 +324,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(ptr addrspace(1) noali %c = load volatile float, ptr addrspace(1) %gep.2 %d = load volatile float, ptr addrspace(1) %gep.3 - %mul = fmul float %a, %b - %fma0 = fsub float %c, %mul - %fma1 = fsub float %d, %mul + %mul = fmul contract float %a, %b + %fma0 = fsub contract float %c, %mul + %fma1 = fsub contract float %d, %mul store volatile float %fma0, ptr addrspace(1) %gep.out.0 store volatile float %fma1, ptr addrspace(1) %gep.out.1 ret void @@ -295,9 +357,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32(ptr addrspace(1) noalias %o %b = load volatile float, ptr addrspace(1) %gep.1 %c = load volatile float, ptr addrspace(1) %gep.2 - %mul = fmul float %a, %b - %mul.neg = fneg float %mul - %fma = fsub float %mul.neg, %c + %mul = fmul contract float %a, %b + %mul.neg = fneg contract float %mul + %fma = fsub contract float %mul.neg, %c store float %fma, ptr addrspace(1) %gep.out ret void @@ -337,10 +399,10 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(ptr addrspace(1) %c = load volatile float, ptr addrspace(1) %gep.2 %d = load volatile float, ptr addrspace(1) %gep.3 - %mul = fmul float %a, %b - %mul.neg = fneg float %mul - %fma0 = fsub float %mul.neg, %c - %fma1 = fsub float %mul.neg, %d + %mul = fmul contract float %a, %b + %mul.neg = fneg contract float %mul + %fma0 = fsub contract float %mul.neg, %c + %fma1 = fsub contract float %mul.neg, %d store volatile float %fma0, ptr addrspace(1) %gep.out.0 store volatile float %fma1, ptr addrspace(1) %gep.out.1 @@ -381,10 +443,10 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(ptr addrspace(1) %c = load volatile float, ptr addrspace(1) %gep.2 %d = load volatile float, ptr addrspace(1) %gep.3 - %mul = fmul float %a, %b - %mul.neg = fneg float %mul - %fma0 = fsub float %mul.neg, %c - %fma1 = fsub float %mul, %d + %mul = fmul 
contract float %a, %b + %mul.neg = fneg contract float %mul + %fma0 = fsub contract float %mul.neg, %c + %fma1 = fsub contract float %mul, %d store volatile float %fma0, ptr addrspace(1) %gep.out.0 store volatile float %fma1, ptr addrspace(1) %gep.out.1 @@ -412,7 +474,7 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(ptr addrspace(1) ; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]] ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_aggressive) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -427,10 +489,22 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1) %u = load volatile float, ptr addrspace(1) %gep.3 %v = load volatile float, ptr addrspace(1) %gep.4 - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0 - %tmp2 = fsub float %tmp1, %z + br i1 %is_aggressive, label %aggressive, label %normal + +normal: + %tmp0_normal = fmul float %u, %v + %tmp1_normal = call float @llvm.fma.f32(float %x, float %y, float %tmp0_normal) #0 + %tmp2_normal = fsub float %tmp1_normal, %z + br label %exit +aggressive: + %tmp0_aggressive = fmul contract reassoc float %u, %v + %tmp1_aggressive = call contract reassoc float @llvm.fma.f32(float %x, float %y, float %tmp0_aggressive) #0 + %tmp2_aggressive = fsub contract reassoc float %tmp1_aggressive, %z + br label %exit + +exit: + %tmp2 = phi float [%tmp2_normal, %normal], [%tmp2_aggressive, %aggressive] store float %tmp2, ptr addrspace(1) %gep.out ret void } @@ -505,7 +579,7 @@ define 
amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(ptr addrspace(1) ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm -define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_aggressive) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -520,10 +594,22 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1) %u = load volatile float, ptr addrspace(1) %gep.3 %v = load volatile float, ptr addrspace(1) %gep.4 - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0 - %tmp2 = fsub float %tmp1, %z + br i1 %is_aggressive, label %aggressive, label %normal +normal: + %tmp0_normal = fmul float %u, %v + %tmp1_normal = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0_normal) #0 + %tmp2_normal = fsub float %tmp1_normal, %z + br label %exit + +aggressive: + %tmp0_aggressive = fmul contract reassoc float %u, %v + %tmp1_aggressive = call contract reassoc float @llvm.fmuladd.f32(float %x, float %y, float %tmp0_aggressive) #0 + %tmp2_aggressive = fsub contract reassoc float %tmp1_aggressive, %z + br label %exit + +exit: + %tmp2 = phi float [%tmp2_normal, %normal], [%tmp2_aggressive, %aggressive] store float %tmp2, ptr addrspace(1) %gep.out ret void } @@ -556,7 +642,7 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1) ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm -define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { 
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_aggressive) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -571,11 +657,23 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1) %u = load volatile float, ptr addrspace(1) %gep.3 %v = load volatile float, ptr addrspace(1) %gep.4 - ; nsz flag is needed since this combine may change sign of zero - %tmp0 = fmul nsz float %u, %v - %tmp1 = call nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0 - %tmp2 = fsub nsz float %x, %tmp1 + br i1 %is_aggressive, label %aggressive, label %normal +normal: + ; nsz flag is needed since this combine may change sign of zero + %tmp0_normal = fmul nsz float %u, %v + %tmp1_normal = call nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0_normal) #0 + %tmp2_normal = fsub nsz float %x, %tmp1_normal + br label %exit + +aggressive: + %tmp0_aggressive = fmul contract reassoc nsz float %u, %v + %tmp1_aggressive = call contract reassoc nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0_aggressive) #0 + %tmp2_aggressive = fsub contract reassoc nsz float %x, %tmp1_aggressive + br label %exit + +exit: + %tmp2 = phi float [%tmp2_normal, %normal], [%tmp2_aggressive, %aggressive] store float %tmp2, ptr addrspace(1) %gep.out ret void } diff --git a/llvm/test/CodeGen/PowerPC/fma-combine.ll b/llvm/test/CodeGen/PowerPC/fma-combine.ll index 3d45e9a3a509c..456f85ad3eefd 100644 --- a/llvm/test/CodeGen/PowerPC/fma-combine.ll +++ b/llvm/test/CodeGen/PowerPC/fma-combine.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -enable-no-signed-zeros-fp-math \ -; RUN: -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CHECK-FAST %s +; RUN: < %s | 
FileCheck -check-prefix=CHECK-FAST %s ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -enable-no-signed-zeros-fp-math \ -; RUN: -enable-unsafe-fp-math -mattr=-vsx < %s | FileCheck -check-prefix=CHECK-FAST-NOVSX %s +; RUN: -mattr=-vsx < %s | FileCheck -check-prefix=CHECK-FAST-NOVSX %s ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s define dso_local double @fma_combine1(double %a, double %b, double %c) { @@ -19,13 +19,12 @@ define dso_local double @fma_combine1(double %a, double %b, double %c) { ; CHECK-LABEL: fma_combine1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xsnegdp 0, 3 -; CHECK-NEXT: xsmuldp 0, 0, 2 -; CHECK-NEXT: xssubdp 1, 0, 1 +; CHECK-NEXT: xsmsubadp 1, 0, 2 ; CHECK-NEXT: blr entry: %fneg1 = fneg double %c - %mul = fmul double %fneg1, %b - %add = fsub double %mul, %a + %mul = fmul contract double %fneg1, %b + %add = fsub contract double %mul, %a ret double %add } @@ -43,13 +42,12 @@ define dso_local double @fma_combine2(double %a, double %b, double %c) { ; CHECK-LABEL: fma_combine2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xsnegdp 0, 3 -; CHECK-NEXT: xsmuldp 0, 2, 0 -; CHECK-NEXT: xssubdp 1, 0, 1 +; CHECK-NEXT: xsmsubadp 1, 2, 0 ; CHECK-NEXT: blr entry: %fneg1 = fneg double %c - %mul = fmul double %b, %fneg1 - %add = fsub double %mul, %a + %mul = fmul contract double %b, %fneg1 + %add = fsub contract double %mul, %a ret double %add } @@ -85,17 +83,16 @@ define dso_local double @fma_combine_two_uses(double %a, double %b, double %c) { ; CHECK-NEXT: stfd 0, v@toc@l(3) ; CHECK-NEXT: xsnegdp 0, 3 ; CHECK-NEXT: addis 3, 2, z@toc@ha +; CHECK-NEXT: xsmsubadp 1, 0, 2 ; CHECK-NEXT: stfd 0, z@toc@l(3) -; CHECK-NEXT: xsmuldp 0, 0, 2 -; CHECK-NEXT: xssubdp 1, 0, 1 ; CHECK-NEXT: blr entry: %fneg = fneg double %a store double %fneg, ptr @v, align 8 %fneg1 = fneg double %c store double %fneg1, ptr @z, align 8 - %mul = fmul double %fneg1, %b - %add = fsub double %mul, %a + %mul = fmul contract double %fneg1, %b + %add = fsub contract double %mul, 
%a ret double %add } @@ -122,15 +119,14 @@ define dso_local double @fma_combine_one_use(double %a, double %b, double %c) { ; CHECK-NEXT: addis 3, 2, v@toc@ha ; CHECK-NEXT: stfd 0, v@toc@l(3) ; CHECK-NEXT: xsnegdp 0, 3 -; CHECK-NEXT: xsmuldp 0, 0, 2 -; CHECK-NEXT: xssubdp 1, 0, 1 +; CHECK-NEXT: xsmsubadp 1, 0, 2 ; CHECK-NEXT: blr entry: %fneg = fneg double %a store double %fneg, ptr @v, align 8 %fneg1 = fneg double %c - %mul = fmul double %fneg1, %b - %add = fsub double %mul, %a + %mul = fmul contract double %fneg1, %b + %add = fsub contract double %mul, %a ret double %add } @@ -327,15 +323,12 @@ define dso_local double @fma_combine_const(double %a, double %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis 3, 2, .LCPI9_0@toc@ha ; CHECK-NEXT: lfd 0, .LCPI9_0@toc@l(3) -; CHECK-NEXT: addis 3, 2, .LCPI9_1@toc@ha -; CHECK-NEXT: xsmuldp 0, 1, 0 -; CHECK-NEXT: lfd 1, .LCPI9_1@toc@l(3) -; CHECK-NEXT: xsmaddadp 2, 0, 1 +; CHECK-NEXT: xsmaddadp 2, 1, 0 ; CHECK-NEXT: fmr 1, 2 ; CHECK-NEXT: blr entry: - %0 = fmul double %a, 1.1 - %1 = call contract double @llvm.fma.f64(double %0, double 2.1, double %b) + %0 = fmul reassoc double %a, 1.1 + %1 = call contract reassoc double @llvm.fma.f64(double %0, double 2.1, double %b) ret double %1 } From 8aa53e4470f3b7c79f7a2acb04526e4bb9d22f98 Mon Sep 17 00:00:00 2001 From: PaperChalice Date: Thu, 26 Jun 2025 12:24:00 +0800 Subject: [PATCH 2/5] remove `UnsafeFPMath` usage in `visitFMULForFMADistributiveCombine` --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 9e6e81e2c0dee..7cf3a8f7a07b7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16737,7 +16737,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { static bool isContractableFMUL(const TargetOptions &Options, SDValue N) { 
assert(N.getOpcode() == ISD::FMUL); - return Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || + return Options.AllowFPOpFusion == FPOpFusion::Fast || N->getFlags().hasAllowContract(); } @@ -17338,8 +17338,7 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { // Floating-point multiply-add with intermediate rounding. This can result // in a less precise result due to the changed rounding order. - bool HasFMAD = Options.UnsafeFPMath && - (LegalOperations && TLI.isFMADLegal(DAG, N)); + bool HasFMAD = LegalOperations && TLI.isFMADLegal(DAG, N); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) From b2778411cfa48a91c5fb68ac17b8253e3b52f2e7 Mon Sep 17 00:00:00 2001 From: PaperChalice Date: Thu, 26 Jun 2025 15:32:15 +0800 Subject: [PATCH 3/5] remove `UnsafeFPMath` usage in visitFDIV --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 7 +- llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll | 1956 +++++++++++++++ llvm/test/CodeGen/AMDGPU/rsq.f32.ll | 2194 +++-------------- llvm/test/CodeGen/NVPTX/sqrt-approx.ll | 16 +- .../test/CodeGen/X86/change-unsafe-fp-math.ll | 34 +- llvm/test/CodeGen/X86/fdiv.ll | 4 +- 6 files changed, 2358 insertions(+), 1853 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 7cf3a8f7a07b7..6c7b1499664b7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18237,8 +18237,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { // Only do the transform if the reciprocal is a legal fp immediate that // isn't too nasty (eg NaN, denormal, ...). 
if (((st == APFloat::opOK && !Recip.isDenormal()) || - (st == APFloat::opInexact && - (Options.UnsafeFPMath || Flags.hasAllowReciprocal()))) && + (st == APFloat::opInexact && Flags.hasAllowReciprocal())) && (!LegalOperations || // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM // backend)... we should handle this gracefully after Legalize. @@ -18249,7 +18248,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { DAG.getConstantFP(Recip, DL, VT)); } - if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) { + if (Flags.hasAllowReciprocal()) { // If this FDIV is part of a reciprocal square root, it may be folded // into a target-specific square root estimate instruction. if (N1.getOpcode() == ISD::FSQRT) { @@ -18324,7 +18323,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { // Fold X/Sqrt(X) -> Sqrt(X) if ((Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) && - (Options.UnsafeFPMath || Flags.hasAllowReassociation())) + Flags.hasAllowReassociation()) if (N1.getOpcode() == ISD::FSQRT && N0 == N1.getOperand(0)) return N1; diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll new file mode 100644 index 0000000000000..7f822c135ffb4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll @@ -0,0 +1,1956 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 + +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-SAFE,SI-DAZ-SAFE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-SAFE,SI-IEEE-SAFE %s + +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-SAFE,CI-DAZ-SAFE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn 
-mcpu=hawaii -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-SAFE,CI-IEEE-SAFE %s + + +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone +declare float @llvm.sqrt.f32(float) nounwind readnone +declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone + +define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { +; GCN-DAZ-SAFE-LABEL: rsq_f32: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7 +; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1 +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-DAZ-SAFE-NEXT: s_endpgm +; +; SI-IEEE-SAFE-LABEL: rsq_f32: 
+; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 +; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 +; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-IEEE-SAFE-NEXT: s_endpgm +; +; 
CI-IEEE-SAFE-LABEL: rsq_f32: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 +; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 +; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-IEEE-SAFE-NEXT: s_endpgm + %val = load float, ptr addrspace(1) %in, align 4 + %sqrt = call contract float @llvm.sqrt.f32(float %val) nounwind readnone + %div = fdiv contract 
float 1.0, %sqrt, !fpmath !0 + store float %div, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %val) { +; GCN-DAZ-SAFE-LABEL: rsq_f32_sgpr: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_load_dword s0, s[4:5], 0xb +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, -1 +; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, s0 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v2, v2, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v3, v1, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-DAZ-SAFE-NEXT: s_endpgm +; +; SI-IEEE-SAFE-LABEL: rsq_f32_sgpr: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 
s0, v1 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v2, vcc, -1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, 1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-IEEE-SAFE-NEXT: s_endpgm +; +; CI-IEEE-SAFE-LABEL: rsq_f32_sgpr: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_load_dword s0, s[4:5], 0xb +; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] +; 
CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v2, vcc, -1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, 1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-IEEE-SAFE-NEXT: s_endpgm + %sqrt = call contract float @llvm.sqrt.f32(float %val) nounwind readnone + %div = fdiv contract float 1.0, %sqrt, !fpmath !0 + store float %div, ptr addrspace(1) %out, align 4 + ret void +} + +; Recognize that this is rsqrt(a) * rcp(b) * c, +; not 1 / ( 1 / sqrt(a)) * rcp(b) * c. + +; NOTE: c * rcp( sqrt(a) * b ) is generated when we move rcp generation to AMDGPUCodeGenPrepare. 
+define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN-DAZ-SAFE-LABEL: rsqrt_fmul: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0 +; GCN-DAZ-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0 +; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-DAZ-SAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-SAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-SAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s0, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v2 +; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[0:1], s[4:5] +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, v2, v5 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v5, v7, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, v7, v8, v7 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v8, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v7, v7, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v8, v5, v7 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, 0x37800000, v5 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v2, v6 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v2, v3 +; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v4 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v5, v3 +; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4 +; 
GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v7, v5, v5 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, v6, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v3, v7, v6 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, v8, v5, v7 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v3, v7, v6 +; GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GCN-DAZ-SAFE-NEXT: v_div_fmas_f32 v3, v3, v5, v7 +; GCN-DAZ-SAFE-NEXT: v_div_fixup_f32 v2, v3, v2, v4 +; GCN-DAZ-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-DAZ-SAFE-NEXT: s_endpgm +; +; GCN-IEEE-SAFE-LABEL: rsqrt_fmul: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0 +; GCN-IEEE-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-IEEE-SAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-SAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-SAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v6, 0x260 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v5, v2 +; GCN-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[0:1], -1, v5 +; GCN-IEEE-SAFE-NEXT: v_add_i32_e64 v8, s[0:1], 1, v5 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v9, -v7, v5, v2 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v10, -v8, v5, v2 +; GCN-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 
s[0:1], 0, v9 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[0:1] +; GCN-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v10 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[0:1] +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v7, 0x37800000, v5 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GCN-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v2, v6 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, v2, v3 +; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, v4 +; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v5, v3 +; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4 +; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[0:1], s[4:5] +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, v7, v5, v5 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v7, v6, v5 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v8, -v3, v7, v6 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v7, v8, v5, v7 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v3, -v3, v7, v6 +; GCN-IEEE-SAFE-NEXT: v_div_fmas_f32 v3, v3, v5, v7 +; GCN-IEEE-SAFE-NEXT: v_div_fixup_f32 v2, v3, v2, v4 +; GCN-IEEE-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-IEEE-SAFE-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 + + %a = load volatile float, ptr addrspace(1) %gep.0 + %b = load volatile float, ptr addrspace(1) %gep.1 + %c = load volatile float, ptr addrspace(1) %gep.2 + + %x = call contract float @llvm.sqrt.f32(float %a) + %y = fmul contract float %x, %b + %z = fdiv contract float %c, %y + store float %z, ptr addrspace(1) %out.gep + ret void +} + +define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { +; GCN-DAZ-SAFE-LABEL: neg_rsq_f32: +; GCN-DAZ-SAFE: ; 
%bb.0: +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7 +; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1 +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-DAZ-SAFE-NEXT: s_endpgm +; +; SI-IEEE-SAFE-LABEL: neg_rsq_f32: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 +; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 +; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-IEEE-SAFE-NEXT: 
s_mov_b32 s0, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-IEEE-SAFE-NEXT: s_endpgm +; +; CI-IEEE-SAFE-LABEL: neg_rsq_f32: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 +; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 +; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 
0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-IEEE-SAFE-NEXT: s_endpgm + %val = load float, ptr addrspace(1) %in, align 4 + %sqrt = call contract float @llvm.sqrt.f32(float %val) + %div = fdiv contract float -1.0, %sqrt, !fpmath !0 + store float %div, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { +; GCN-DAZ-SAFE-LABEL: neg_rsq_neg_f32: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-SAFE-NEXT: 
s_mov_b32 s11, s7 +; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0x8f800000 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1 +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-DAZ-SAFE-NEXT: s_endpgm +; +; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 +; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 +; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 +; 
SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, s[0:1] +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 +; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-IEEE-SAFE-NEXT: s_endpgm +; +; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 +; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 +; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 +; 
CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, s[0:1] +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 +; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-IEEE-SAFE-NEXT: s_endpgm + %val = load float, ptr addrspace(1) %in, align 4 + %val.fneg = fneg float %val + %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg) + %div = fdiv contract float -1.0, %sqrt, !fpmath !0 + store float %div, ptr addrspace(1) %out, align 4 + ret void +} + +define float @v_neg_rsq_neg_f32(float %val) { +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 
v1, 0.5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v2, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; 
SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %val.fneg = fneg float %val + %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg) + %div = fdiv contract float -1.0, %sqrt, !fpmath !0 + ret float %div +} + +define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) { +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, 0x4f800000 +; GCN-DAZ-SAFE-NEXT: 
v_mul_f32_e64 v2, -v1, s5 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v3, -v0, s5 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v3, v0 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v4, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0.5, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v2, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v5, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v2, v2, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v5, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v6, v3, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s7 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 +; 
SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v0, s7 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v4, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, 
vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v1, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s7 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v0, s7 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v4, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 
v4, v2, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %val.fneg = fneg <2 x float> %val + %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val.fneg) + %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0 + ret <2 x float> %div +} + +define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) { +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3 +; 
GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; 
+; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %val0.neg = fneg float %val0 + %sqrt = call contract float @llvm.sqrt.f32(float %val0.neg) + %div = fdiv contract float -1.0, %sqrt, !fpmath !0 + %user = fmul contract float %div, %val1 + ret float %user +} + +define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) { +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, 0x4f800000 +; GCN-DAZ-SAFE-NEXT: 
v_mul_f32_e64 v4, -v1, s5 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v4, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, v1, v4 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0.5, v4 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v4, v5, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v6, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v6, v4 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v7, v4, v5 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v5, -v0, s5 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v5, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v0 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v6 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, v0, v5 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v4, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v7, v4 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v4, v4, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v7, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v8, v5, v4 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v6 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 
0x4f800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s7 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v6, -v0, s7 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v6, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v0, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: 
v_rcp_f32_e32 v4, v4 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v1, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s7 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v6, -v0, s7 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v6, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 
v1, v4, v1, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %val0.fneg = fneg <2 x float> %val0 + %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0.fneg) + %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0 + %user = fmul contract <2 x float> %div, %val1 + ret <2 x float> %user +} + +define float @v_neg_rsq_f32(float %val) { +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 +; GCN-DAZ-SAFE-NEXT: 
v_mul_f32_e32 v2, v0, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v2, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 
+; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract float @llvm.sqrt.f32(float %val) + %div = fdiv contract float -1.0, %sqrt, !fpmath !0 + ret float %div +} + +define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) { +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1 +; 
GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x4f800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v3, v0 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v4, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0.5, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v2, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v5, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v2, v2, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v5, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v6, v3, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; 
SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 
v0, v2, v0 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v1, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4 +; CI-IEEE-SAFE-NEXT: 
v_fma_f32 v4, -v5, v4, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val) + %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0 + ret <2 x float> %div +} + +define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) { +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32_foldable_user: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v3, 
0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract float @llvm.sqrt.f32(float %val0) + %div = fdiv contract float -1.0, %sqrt, !fpmath !0 + %user = fmul contract float %div, %val1 + ret float %user +} + +define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) { +; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v4, v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, v1, v4 +; 
GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0.5, v4 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v4, v5, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v6, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v5, v1 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v6, v4 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v7, v4, v5 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v0 +; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v0 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260 +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v6 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, v0, v5 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v4, 0.5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v7, v4 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v4, v4, v0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v7, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v8, v5, v4 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v6 +; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4 +; 
SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v0, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 
s[4:5], |v1|, s6 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v1, v4, s[4:5] +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6 +; 
CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0 +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0) + %div = fdiv contract <2 x float> , %sqrt, !fpmath !0 + %user = fmul contract <2 x float> %div, %val1 + ret <2 x float> %user +} + +define float @v_rsq_f32(float %val) { +; GCN-DAZ-LABEL: v_rsq_f32: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-SAFE-LABEL: v_rsq_f32: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract float 
@llvm.sqrt.f32(float %val), !fpmath !1 + %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + ret float %div +} + +define { float, float } @v_rsq_f32_multi_use(float %val) { +; GCN-DAZ-SAFE-LABEL: v_rsq_f32_multi_use: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v1, v0 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v2, vcc, 0, v2 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: 
v_rsq_f32_multi_use: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v2, vcc, 0, v2 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 + %insert.0 = insertvalue { float, float } poison, float %sqrt, 0 + %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + %insert.1 = insertvalue { float, float } %insert.0, float %div, 1 + ret { float, float } %insert.1 +} + +define float @v_rsq_f32_missing_contract0(float %val) { +; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract0: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: 
v_rsq_f32_missing_contract0: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, 
-v2, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call float @llvm.sqrt.f32(float %val), !fpmath !1 + %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + ret float %div +} + +define float @v_rsq_f32_missing_contract1(float %val) { +; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract1: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1: +; SI-IEEE-SAFE: ; %bb.0: +; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; SI-IEEE-SAFE-NEXT: 
v_add_i32_e64 v3, s[4:5], 1, v1 +; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1: +; CI-IEEE-SAFE: ; %bb.0: +; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 +; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 +; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; 
CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 +; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 +; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 + %div = fdiv float 1.0, %sqrt, !fpmath !1 + ret float %div +} + +; Test that we contract into FMA for an fadd user after introducing +; the fmul. +define float @v_rsq_f32_contractable_user(float %val0, float %val1) { +; GCN-DAZ-LABEL: v_rsq_f32_contractable_user: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v0, v0, v2, v1 +; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1 + %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + %add = fadd contract float %div, %val1 + ret float %add +} + +; Missing contract on the fdiv +define float @v_rsq_f32_contractable_user_missing_contract0(float %val0, float %val1) { +; GCN-DAZ-LABEL: v_rsq_f32_contractable_user_missing_contract0: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; 
GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v0, v0, v2, v1 +; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1 + %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + %add = fadd contract float %div, %val1 + ret float %add +} + +; Missing contract on the fadd +define float @v_rsq_f32_contractable_user_missing_contract1(float %val0, float %val1) { +; GCN-DAZ-LABEL: v_rsq_f32_contractable_user_missing_contract1: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 12, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GCN-IEEE-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1 + %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + %add = fadd float %div, %val1 + ret float %add +} + +define float 
@v_rsq_f32_known_never_denormal(float nofpclass(sub) %val) { +; GCN-DAZ-LABEL: v_rsq_f32_known_never_denormal: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-LABEL: v_rsq_f32_known_never_denormal: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 + %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + ret float %div +} + +define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) { +; GCN-DAZ-LABEL: v_rsq_f32_known_never_posdenormal: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-SAFE-LABEL: v_rsq_f32_known_never_posdenormal: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 + %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + ret float %div +} + +!0 = !{float 2.500000e+00} +!1 = !{float 1.000000e+00} + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CI-DAZ-SAFE: {{.*}} +; SI-DAZ-SAFE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll index f4b947ade8dac..f7e0388561104 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll @@ -2,175 +2,51 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-UNSAFE,SI-DAZ-UNSAFE %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-UNSAFE,SI-IEEE-UNSAFE %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-SAFE,SI-DAZ-SAFE %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-SAFE,SI-IEEE-SAFE %s - ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-UNSAFE,CI-DAZ-UNSAFE %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-UNSAFE,CI-IEEE-UNSAFE %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-SAFE,CI-DAZ-SAFE %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-SAFE,CI-IEEE-SAFE %s - declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare float @llvm.sqrt.f32(float) nounwind readnone declare <2 x float> 
@llvm.sqrt.v2f32(<2 x float>) nounwind readnone define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { -; GCN-DAZ-UNSAFE-LABEL: rsq_f32: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s11, s7 -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s8, s2 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s9, s3 -; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s4, s0 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1 -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-DAZ-UNSAFE-NEXT: s_endpgm -; -; GCN-IEEE-UNSAFE-LABEL: rsq_f32: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s11, s7 -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s8, s2 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s9, s3 -; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s4, s0 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s5, s1 -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-IEEE-UNSAFE-NEXT: s_endpgm -; -; GCN-DAZ-SAFE-LABEL: rsq_f32: -; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7 -; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3 -; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0xf800000 -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1 -; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 -; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-DAZ-SAFE-NEXT: s_endpgm -; -; SI-IEEE-SAFE-LABEL: rsq_f32: -; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 -; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 -; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 -; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 
0x4f800000, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 -; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 -; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-IEEE-SAFE-NEXT: s_endpgm +; GCN-DAZ-LABEL: rsq_f32: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-DAZ-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-NEXT: s_mov_b32 s11, s7 +; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-NEXT: s_mov_b32 s8, s2 +; GCN-DAZ-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-NEXT: s_mov_b32 s4, s0 +; GCN-DAZ-NEXT: s_mov_b32 s5, s1 +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-DAZ-NEXT: s_endpgm ; -; CI-IEEE-SAFE-LABEL: rsq_f32: -; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; 
CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 -; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 -; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 -; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 -; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 -; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; CI-IEEE-SAFE-NEXT: s_endpgm +; GCN-IEEE-LABEL: rsq_f32: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IEEE-NEXT: s_mov_b32 s6, -1 +; GCN-IEEE-NEXT: s_mov_b32 s10, s6 +; GCN-IEEE-NEXT: s_mov_b32 s11, s7 +; 
GCN-IEEE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-NEXT: s_mov_b32 s8, s2 +; GCN-IEEE-NEXT: s_mov_b32 s9, s3 +; GCN-IEEE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-IEEE-NEXT: s_mov_b32 s4, s0 +; GCN-IEEE-NEXT: s_mov_b32 s5, s1 +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IEEE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: rsq_f32: ; GCN-UNSAFE: ; %bb.0: ; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 @@ -196,131 +72,27 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( } define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %val) { -; GCN-DAZ-UNSAFE-LABEL: rsq_f32_sgpr: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dword s2, s[4:5], 0xb -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s2, -1 -; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-DAZ-UNSAFE-NEXT: s_endpgm -; -; GCN-IEEE-UNSAFE-LABEL: rsq_f32_sgpr: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dword s2, s[4:5], 0xb -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s2, -1 -; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-IEEE-UNSAFE-NEXT: s_endpgm -; -; GCN-DAZ-SAFE-LABEL: rsq_f32_sgpr: -; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dword s0, s[4:5], 0xb -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, -1 -; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1 -; 
GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, s0 -; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 -; GCN-DAZ-SAFE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v2, v2, v0 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v3, v1, v2 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-DAZ-SAFE-NEXT: s_endpgm -; -; SI-IEEE-SAFE-LABEL: rsq_f32_sgpr: -; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dword s0, s[4:5], 0xb -; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1 -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v2, vcc, -1, v1 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 -; SI-IEEE-SAFE-NEXT: 
v_cmp_lt_f32_e32 vcc, 0, v1 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 -; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x7f800000 -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s0 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-IEEE-SAFE-NEXT: s_endpgm -; -; CI-IEEE-SAFE-LABEL: rsq_f32_sgpr: -; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dword s0, s[4:5], 0xb -; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1 -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s0 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v2, vcc, -1, v1 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 
v1, v1, v2, s[0:1] -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 -; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; CI-IEEE-SAFE-NEXT: s_endpgm +; GCN-DAZ-LABEL: rsq_f32_sgpr: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_load_dword s2, s[4:5], 0xb +; GCN-DAZ-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-DAZ-NEXT: s_mov_b32 s3, 0xf000 +; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, s2 +; GCN-DAZ-NEXT: s_mov_b32 s2, -1 +; GCN-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-DAZ-NEXT: s_endpgm +; +; GCN-IEEE-LABEL: rsq_f32_sgpr: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_load_dword s2, s[4:5], 0xb +; GCN-IEEE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-IEEE-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IEEE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, s2 +; GCN-IEEE-NEXT: s_mov_b32 s2, -1 +; GCN-IEEE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-IEEE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: rsq_f32_sgpr: ; GCN-UNSAFE: ; %bb.0: ; GCN-UNSAFE-NEXT: s_load_dword s2, s[0:1], 0xb @@ -365,154 +137,53 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-UNSAFE-NEXT: v_mul_f32_e32 v2, v4, v2 ; GCN-UNSAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-UNSAFE-NEXT: s_endpgm -; GCN-DAZ-UNSAFE-LABEL: rsqrt_fmul: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, 0 -; GCN-DAZ-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-DAZ-UNSAFE-NEXT: v_mov_b32_e32 v1, 0 -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN-DAZ-UNSAFE-NEXT: s_mov_b64 s[8:9], s[2:3] -; GCN-DAZ-UNSAFE-NEXT: s_mov_b64 s[10:11], s[6:7] -; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v2, v2 -; GCN-DAZ-UNSAFE-NEXT: v_rcp_f32_e32 v3, v3 -; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e32 v2, v2, v3 -; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e32 v2, v4, v2 -; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 -; GCN-DAZ-UNSAFE-NEXT: s_endpgm -; -; GCN-IEEE-UNSAFE-LABEL: rsqrt_fmul: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, 0 -; GCN-IEEE-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-IEEE-UNSAFE-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: s_mov_b64 s[8:9], s[2:3] -; GCN-IEEE-UNSAFE-NEXT: s_mov_b64 s[10:11], s[6:7] -; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v2, v2 -; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e32 v3, v3 -; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v2, v2, v3 -; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v2, v4, v2 -; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 -; GCN-IEEE-UNSAFE-NEXT: 
s_endpgm -; -; GCN-DAZ-SAFE-LABEL: rsqrt_fmul: -; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0 -; GCN-DAZ-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0 -; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[10:11], s[2:3] -; GCN-DAZ-SAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-SAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc -; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-SAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc -; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s0, 0xf800000 -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2 -; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v2 -; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[0:1], s[4:5] -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, v2, v5 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v5, v7, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, v7, v8, v7 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v8, v5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v7, v7, v2 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v8, v5, v7 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, 0x37800000, v5 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v2, v6 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v2, v3 -; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v4 -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v5, v3 -; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4 -; GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; 
GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v7, v5, v5 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, v6, v5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v3, v7, v6 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, v8, v5, v7 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v3, v7, v6 -; GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GCN-DAZ-SAFE-NEXT: v_div_fmas_f32 v3, v3, v5, v7 -; GCN-DAZ-SAFE-NEXT: v_div_fixup_f32 v2, v3, v2, v4 -; GCN-DAZ-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; GCN-DAZ-SAFE-NEXT: s_endpgm -; -; GCN-IEEE-SAFE-LABEL: rsqrt_fmul: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0 -; GCN-IEEE-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[10:11], s[2:3] -; GCN-IEEE-SAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-IEEE-SAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-IEEE-SAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v6, 0x260 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v5, v2 -; GCN-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[0:1], -1, v5 -; GCN-IEEE-SAFE-NEXT: v_add_i32_e64 v8, s[0:1], 1, v5 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v9, -v7, v5, v2 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v10, -v8, v5, v2 -; GCN-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v9 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[0:1] 
-; GCN-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v10 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[0:1] -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v7, 0x37800000, v5 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GCN-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v2, v6 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, v2, v3 -; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, v4 -; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v5, v3 -; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4 -; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[0:1], s[4:5] -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, v7, v5, v5 -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v7, v6, v5 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v8, -v3, v7, v6 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v7, v8, v5, v7 -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v3, -v3, v7, v6 -; GCN-IEEE-SAFE-NEXT: v_div_fmas_f32 v3, v3, v5, v7 -; GCN-IEEE-SAFE-NEXT: v_div_fixup_f32 v2, v3, v2, v4 -; GCN-IEEE-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; GCN-IEEE-SAFE-NEXT: s_endpgm +; GCN-DAZ-LABEL: rsqrt_fmul: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-DAZ-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-NEXT: s_mov_b32 s6, 0 +; GCN-DAZ-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-DAZ-NEXT: v_mov_b32_e32 v1, 0 +; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-DAZ-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-DAZ-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-DAZ-NEXT: v_rsq_f32_e32 v2, v2 +; GCN-DAZ-NEXT: v_rcp_f32_e32 v3, v3 +; GCN-DAZ-NEXT: v_mul_f32_e32 v2, v2, v3 +; GCN-DAZ-NEXT: 
v_mul_f32_e32 v2, v4, v2 +; GCN-DAZ-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-DAZ-NEXT: s_endpgm +; +; GCN-IEEE-LABEL: rsqrt_fmul: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IEEE-NEXT: s_mov_b32 s6, 0 +; GCN-IEEE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-IEEE-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IEEE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-IEEE-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-IEEE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-IEEE-NEXT: v_rsq_f32_e32 v2, v2 +; GCN-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GCN-IEEE-NEXT: v_mul_f32_e32 v2, v2, v3 +; GCN-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2 +; GCN-IEEE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-IEEE-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -525,167 +196,49 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i %x = call contract float @llvm.sqrt.f32(float %a) %y = fmul contract float %x, %b - %z = fdiv contract float %c, %y + %z = fdiv arcp contract float %c, %y store float %z, ptr addrspace(1) %out.gep ret void } define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { -; GCN-DAZ-UNSAFE-LABEL: neg_rsq_f32: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, 
s6 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s11, s7 -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s8, s2 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s9, s3 -; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s4, s0 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1 -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-DAZ-UNSAFE-NEXT: s_endpgm -; -; GCN-IEEE-UNSAFE-LABEL: neg_rsq_f32: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s11, s7 -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s8, s2 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s9, s3 -; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s4, s0 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s5, s1 -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-IEEE-UNSAFE-NEXT: s_endpgm -; -; GCN-DAZ-SAFE-LABEL: neg_rsq_f32: -; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7 -; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3 -; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0xf800000 -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0 -; 
GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1 -; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 -; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-DAZ-SAFE-NEXT: s_endpgm -; -; SI-IEEE-SAFE-LABEL: neg_rsq_f32: -; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 -; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 -; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 -; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 -; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 -; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, 
vcc, 1, v2 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1] -; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-IEEE-SAFE-NEXT: s_endpgm +; GCN-DAZ-LABEL: neg_rsq_f32: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-DAZ-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-NEXT: s_mov_b32 s11, s7 +; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-NEXT: s_mov_b32 s8, s2 +; GCN-DAZ-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-NEXT: s_mov_b32 s4, s0 +; GCN-DAZ-NEXT: s_mov_b32 s5, s1 +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-DAZ-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-DAZ-NEXT: s_endpgm ; -; CI-IEEE-SAFE-LABEL: neg_rsq_f32: -; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 -; CI-IEEE-SAFE-NEXT: s_waitcnt 
lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 -; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 -; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 -; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 -; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; CI-IEEE-SAFE-NEXT: s_endpgm +; GCN-IEEE-LABEL: neg_rsq_f32: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IEEE-NEXT: s_mov_b32 s6, -1 +; GCN-IEEE-NEXT: s_mov_b32 s10, s6 +; GCN-IEEE-NEXT: s_mov_b32 s11, s7 +; GCN-IEEE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-NEXT: s_mov_b32 s8, s2 +; GCN-IEEE-NEXT: s_mov_b32 s9, s3 +; GCN-IEEE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-IEEE-NEXT: s_mov_b32 s4, s0 
+; GCN-IEEE-NEXT: s_mov_b32 s5, s1 +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IEEE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: neg_rsq_f32: ; GCN-UNSAFE: ; %bb.0: ; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 @@ -712,161 +265,43 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp } define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { -; GCN-DAZ-UNSAFE-LABEL: neg_rsq_neg_f32: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s11, s7 -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s8, s2 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s9, s3 -; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s4, s0 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1 -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 -; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-DAZ-UNSAFE-NEXT: s_endpgm -; -; GCN-IEEE-UNSAFE-LABEL: neg_rsq_neg_f32: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s11, s7 -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s8, s2 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s9, s3 -; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s4, s0 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s5, s1 -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt 
vmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-IEEE-UNSAFE-NEXT: s_endpgm -; -; GCN-DAZ-SAFE-LABEL: neg_rsq_neg_f32: -; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7 -; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3 -; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0x8f800000 -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1 -; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0 -; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-DAZ-SAFE-NEXT: s_endpgm -; -; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: -; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; 
SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 -; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 -; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000 -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 -; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, s[0:1] -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 -; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 -; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1] -; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-IEEE-SAFE-NEXT: s_endpgm -; -; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: -; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; 
CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 -; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 -; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000 -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 -; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, s[0:1] -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 -; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 -; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; CI-IEEE-SAFE-NEXT: s_endpgm +; GCN-DAZ-LABEL: neg_rsq_neg_f32: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-DAZ-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-NEXT: s_mov_b32 s11, s7 +; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-NEXT: 
s_mov_b32 s8, s2 +; GCN-DAZ-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-NEXT: s_mov_b32 s4, s0 +; GCN-DAZ-NEXT: s_mov_b32 s5, s1 +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-DAZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-DAZ-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-DAZ-NEXT: s_endpgm +; +; GCN-IEEE-LABEL: neg_rsq_neg_f32: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IEEE-NEXT: s_mov_b32 s6, -1 +; GCN-IEEE-NEXT: s_mov_b32 s10, s6 +; GCN-IEEE-NEXT: s_mov_b32 s11, s7 +; GCN-IEEE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-NEXT: s_mov_b32 s8, s2 +; GCN-IEEE-NEXT: s_mov_b32 s9, s3 +; GCN-IEEE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-IEEE-NEXT: s_mov_b32 s4, s0 +; GCN-IEEE-NEXT: s_mov_b32 s5, s1 +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-IEEE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IEEE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: neg_rsq_neg_f32: ; GCN-UNSAFE: ; %bb.0: ; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 @@ -894,101 +329,19 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad } define float @v_neg_rsq_neg_f32(float %val) { -; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_f32: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 -; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32: -; GCN-DAZ-SAFE: ; %bb.0: 
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0 -; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v2, v0 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v2 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32: -; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 -; SI-IEEE-SAFE-NEXT: 
v_cmp_class_f32_e32 vcc, v0, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5] -; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-LABEL: v_neg_rsq_neg_f32: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-DAZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] ; -; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32: -; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 -; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; CI-IEEE-SAFE-NEXT: 
v_frexp_exp_i32_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-IEEE-LABEL: v_neg_rsq_neg_f32: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-IEEE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] %val.fneg = fneg float %val %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg) %div = fdiv contract float -1.0, %sqrt, !fpmath !0 @@ -996,168 +349,23 @@ define float @v_neg_rsq_neg_f32(float %val) { } define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) { -; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_v2f32: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1 -; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1 -; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32: -; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, 0x4f800000 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s5 -; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v1 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 
0.5, v2 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v3, -v0, s5 -; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v3, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v3, v0 -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v4, 0x260 -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v4 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0.5, v3 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v2, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v5, v2 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v2, v2, v0 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v5, v3 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v6, v3, v2 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1 -; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32: -; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000 -; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s7 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1 -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1 -; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 -; 
SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v0, s7 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v4, vcc -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0 -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 -; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000 -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5] -; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v1, v2, s[4:5] -; SI-IEEE-SAFE-NEXT: 
v_rcp_f32_e32 v2, v2 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1 -; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-LABEL: v_neg_rsq_neg_v2f32: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-DAZ-NEXT: v_rsq_f32_e64 v1, -v1 +; GCN-DAZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-DAZ-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] ; -; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32: -; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000 -; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s7 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1 -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1 -; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v0, s7 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v4, vcc -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0 -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 -; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0 -; 
CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 -; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 -; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 -; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1 -; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-IEEE-LABEL: v_neg_rsq_neg_v2f32: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-IEEE-NEXT: v_rsq_f32_e64 v1, -v1 +; GCN-IEEE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-IEEE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] %val.fneg = fneg <2 x float> %val %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val.fneg) %div = fdiv contract <2 x float> , %sqrt, !fpmath !0 @@ -1165,104 +373,19 @@ define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) { } define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) { -; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 -; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1 -; 
GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: -; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 -; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v0 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v2 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v0 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: -; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0 -; 
SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 -; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5] -; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-LABEL: v_neg_rsq_neg_f32_foldable_user: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-DAZ-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] ; -; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user: -; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, 
s[4:5], 1, v2 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 -; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 -; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 -; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-IEEE-LABEL: v_neg_rsq_neg_f32_foldable_user: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] %val0.neg = fneg float %val0 %sqrt = call contract float @llvm.sqrt.f32(float %val0.neg) %div = fdiv contract float -1.0, %sqrt, !fpmath !0 @@ -1271,546 +394,86 @@ define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) { } define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) { -; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1 -; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2 -; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3 -; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 
v1, -v1 -; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2 -; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: -; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, 0x4f800000 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s5 -; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v4, v1 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, v1, v4 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0.5, v4 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v4, v5, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v6, v5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v5, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v6, v4 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v7, v4, v5 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v5, -v0, s5 -; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v5, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v0 -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260 -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v6 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, v0, v5 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v4, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v7, v4 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v4, v4, v0 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v7, v5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v8, v5, v4 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v6 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, 
-v0 -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 -; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: -; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000 -; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s7 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1 -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1 -; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5] -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5] -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v6, -v0, s7 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v6, vcc -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0 -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260 -; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 
0x37800000, v4 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000 -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v0, v4, s[4:5] -; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0 -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v1, v4, s[4:5] -; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1 -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 -; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: -; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000 -; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s7 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1 -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1 -; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5] -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5] -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 
v5, 0x37800000, v4 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v6, -v0, s7 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v6, vcc -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0 -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260 -; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 -; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0 -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4 -; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 -; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1 -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 -; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] - %val0.fneg = fneg <2 x float> %val0 - %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0.fneg) - %div = fdiv contract <2 x float> , %sqrt, !fpmath !0 - %user = fmul contract <2 x float> %div, %val1 - ret <2 x float> %user -} - -define float @v_neg_rsq_f32(float 
%val) { -; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_f32: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_f32: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32: -; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 -; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v2, v0 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v2 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32: -; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; 
SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 -; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5] -; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32: -; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] 
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 -; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-LABEL: v_neg_rsq_neg_v2f32_foldable_user: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-DAZ-NEXT: v_rsq_f32_e64 v1, -v1 +; GCN-DAZ-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GCN-DAZ-NEXT: v_mul_f32_e64 v1, -v1, v3 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-LABEL: v_neg_rsq_neg_v2f32_foldable_user: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e64 v0, -v0 +; GCN-IEEE-NEXT: v_rsq_f32_e64 v1, -v1 +; GCN-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GCN-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v3 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] + %val0.fneg = fneg <2 x float> %val0 + %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0.fneg) + %div = fdiv contract <2 x float> , %sqrt, !fpmath !0 + %user = fmul contract <2 x float> %div, %val1 + ret <2 x float> %user +} + +define float @v_neg_rsq_f32(float %val) { +; GCN-DAZ-LABEL: v_neg_rsq_f32: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-LABEL: v_neg_rsq_f32: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-IEEE-NEXT: s_setpc_b64 
s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val) %div = fdiv contract float -1.0, %sqrt, !fpmath !0 ret float %div } define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) { -; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_v2f32: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1 -; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1 -; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32: -; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1 -; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v1 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x4f800000, v0 -; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v3, v0 -; 
GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v4, 0x260 -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v4 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0.5, v3 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v2, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v5, v2 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v2, v2, v0 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v5, v3 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v6, v3, v2 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1 -; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32: -; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000 -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1 -; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1 -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1 -; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0 -; SI-IEEE-SAFE-NEXT: 
v_mov_b32_e32 v3, 0x260 -; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000 -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5] -; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v1, v2, s[4:5] -; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1 -; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-LABEL: v_neg_rsq_v2f32: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_rsq_f32_e32 v1, v1 +; GCN-DAZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-DAZ-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] ; -; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32: -; CI-IEEE-SAFE: ; 
%bb.0: -; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000 -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1 -; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1 -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1 -; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0 -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 -; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5] -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5] -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 -; CI-IEEE-SAFE-NEXT: 
v_frexp_exp_i32_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 -; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1 -; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1 -; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-IEEE-LABEL: v_neg_rsq_v2f32: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-NEXT: v_rsq_f32_e32 v1, v1 +; GCN-IEEE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-IEEE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val) %div = fdiv contract <2 x float> , %sqrt, !fpmath !0 ret <2 x float> %div } define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) { -; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_f32_foldable_user: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1 -; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_f32_foldable_user: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32_foldable_user: -; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 -; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v0 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v2 -; 
GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v0 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user: -; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 -; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5] -; 
SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-LABEL: v_neg_rsq_f32_foldable_user: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] ; -; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user: -; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5] -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260 -; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2 -; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0 -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 -; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-IEEE-LABEL: 
v_neg_rsq_f32_foldable_user: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val0) %div = fdiv contract float -1.0, %sqrt, !fpmath !0 %user = fmul contract float %div, %val1 @@ -1818,24 +481,23 @@ define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) { } define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) { -; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_v2f32_foldable_user: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1 -; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2 -; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3 -; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32_foldable_user: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1 -; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2 -; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-LABEL: v_neg_rsq_v2f32_foldable_user: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_rsq_f32_e32 v1, v1 +; GCN-DAZ-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GCN-DAZ-NEXT: v_mul_f32_e64 v1, -v1, v3 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] ; +; GCN-IEEE-LABEL: v_neg_rsq_v2f32_foldable_user: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-NEXT: v_rsq_f32_e32 v1, v1 +; GCN-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GCN-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v3 +; GCN-IEEE-NEXT: s_setpc_b64 
s[30:31] ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user: ; GCN-DAZ-SAFE: ; %bb.0: ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1876,7 +538,6 @@ define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float> ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] -; ; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1932,7 +593,6 @@ define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float> ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] -; ; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user: ; CI-IEEE-SAFE: ; %bb.0: ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1996,12 +656,11 @@ define float @v_rsq_f32(float %val) { ; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] ; -; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; +; GCN-IEEE-LABEL: v_rsq_f32: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] ; GCN-IEEE-SAFE-LABEL: v_rsq_f32: ; GCN-IEEE-SAFE: ; %bb.0: ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2019,29 +678,27 @@ define float @v_rsq_f32(float %val) { } define { float, float } @v_rsq_f32_multi_use(float %val) { -; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_multi_use: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_sqrt_f32_e32 v2, v0 -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v1, v0 -; GCN-DAZ-UNSAFE-NEXT: v_mov_b32_e32 v0, v2 -; GCN-DAZ-UNSAFE-NEXT: 
s_setpc_b64 s[30:31] -; -; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_multi_use: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v2, v0 -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v1, v0 -; GCN-IEEE-UNSAFE-NEXT: v_mov_b32_e32 v0, v2 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-LABEL: v_rsq_f32_multi_use: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v2, v0 +; GCN-DAZ-NEXT: v_rsq_f32_e32 v1, v0 +; GCN-DAZ-NEXT: v_mov_b32_e32 v0, v2 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] ; +; GCN-IEEE-LABEL: v_rsq_f32_multi_use: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_sqrt_f32_e32 v2, v0 +; GCN-IEEE-NEXT: v_rsq_f32_e32 v1, v0 +; GCN-IEEE-NEXT: v_mov_b32_e32 v0, v2 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] ; GCN-DAZ-SAFE-LABEL: v_rsq_f32_multi_use: ; GCN-DAZ-SAFE: ; %bb.0: ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] -; ; SI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2072,7 +729,6 @@ define { float, float } @v_rsq_f32_multi_use(float %val) { ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v2, vcc, 0, v2 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] -; ; CI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use: ; CI-IEEE-SAFE: ; %bb.0: ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2102,31 +758,29 @@ define { float, float } @v_rsq_f32_multi_use(float %val) { ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 %insert.0 = insertvalue { float, float } poison, float %sqrt, 0 - %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + %div = fdiv arcp contract float 1.0, %sqrt, !fpmath 
!1 %insert.1 = insertvalue { float, float } %insert.0, float %div, 1 ret { float, float } %insert.1 } define float @v_rsq_f32_missing_contract0(float %val) { -; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_missing_contract0: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_missing_contract0: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-LABEL: v_rsq_f32_missing_contract0: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] ; +; GCN-IEEE-LABEL: v_rsq_f32_missing_contract0: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] ; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract0: ; GCN-DAZ-SAFE: ; %bb.0: ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] -; ; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2157,7 +811,6 @@ define float @v_rsq_f32_missing_contract0(float %val) { ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] -; ; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0: ; CI-IEEE-SAFE: ; %bb.0: ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2186,90 +839,24 @@ define float @v_rsq_f32_missing_contract0(float %val) { ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %sqrt = call float 
@llvm.sqrt.f32(float %val), !fpmath !1 - %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + %div = fdiv arcp contract float 1.0, %sqrt, !fpmath !1 ret float %div } define float @v_rsq_f32_missing_contract1(float %val) { -; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_missing_contract1: -; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_missing_contract1: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract1: -; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1: -; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] -; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 -; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 
vcc, v0, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000 -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-LABEL: v_rsq_f32_missing_contract1: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] ; -; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1: -; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000 -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] -; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 -; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 -; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 -; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, 
v0 -; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-IEEE-LABEL: v_rsq_f32_missing_contract1: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 - %div = fdiv float 1.0, %sqrt, !fpmath !1 + %div = fdiv arcp float 1.0, %sqrt, !fpmath !1 ret float %div } @@ -2283,25 +870,12 @@ define float @v_rsq_f32_contractable_user(float %val0, float %val1) { ; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] ; -; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc -; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v0, v0, v2, v1 -; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-IEEE-LABEL: v_rsq_f32_contractable_user: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1 %div = fdiv contract float 1.0, %sqrt, !fpmath !1 %add = fadd contract float %div, %val1 @@ -2317,25 +891,12 @@ define float 
@v_rsq_f32_contractable_user_missing_contract0(float %val0, float % ; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] ; -; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc -; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GCN-IEEE-SAFE-NEXT: v_fma_f32 v0, v0, v2, v1 -; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-IEEE-LABEL: v_rsq_f32_contractable_user_missing_contract0: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1 %div = fdiv contract float 1.0, %sqrt, !fpmath !1 %add = fadd contract float %div, %val1 @@ -2351,25 +912,12 @@ define float @v_rsq_f32_contractable_user_missing_contract1(float %val0, float % ; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] ; -; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; 
GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc -; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 12, vcc -; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GCN-IEEE-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; GCN-IEEE-LABEL: v_rsq_f32_contractable_user_missing_contract1: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1 %div = fdiv contract float 1.0, %sqrt, !fpmath !1 %add = fadd float %div, %val1 @@ -2400,23 +948,11 @@ define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) { ; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] ; -; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_known_never_posdenormal: -; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; -; GCN-IEEE-SAFE-LABEL: v_rsq_f32_known_never_posdenormal: -; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 -; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc -; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc -; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] +; 
GCN-IEEE-LABEL: v_rsq_f32_known_never_posdenormal: +; GCN-IEEE: ; %bb.0: +; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 %div = fdiv contract float 1.0, %sqrt, !fpmath !1 ret float %div @@ -2427,9 +963,9 @@ define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) { attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CI-DAZ-SAFE: {{.*}} ; CI-DAZ-UNSAFE: {{.*}} ; CI-IEEE-UNSAFE: {{.*}} -; SI-DAZ-SAFE: {{.*}} +; GCN-DAZ-UNSAFE: {{.*}} +; GCN-IEEE-UNSAFE: {{.*}} ; SI-DAZ-UNSAFE: {{.*}} ; SI-IEEE-UNSAFE: {{.*}} diff --git a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll index a28d264cd8ec0..3989c8e32e458 100644 --- a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll +++ b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll @@ -43,7 +43,7 @@ define float @test_rsqrt_ftz(float %a) #0 #1 { ret float %ret } -define double @test_rsqrt64(double %a) #0 { +define double @test_rsqrt64(double %a) { ; CHECK-LABEL: test_rsqrt64( ; CHECK: { ; CHECK-NEXT: .reg .b64 %rd<3>; @@ -54,12 +54,12 @@ define double @test_rsqrt64(double %a) #0 { ; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; ; CHECK-NEXT: ret; %val = tail call double @llvm.sqrt.f64(double %a) - %ret = fdiv double 1.0, %val + %ret = fdiv arcp double 1.0, %val ret double %ret } ; There's no rsqrt.approx.ftz.f64 instruction; we just use the non-ftz version. 
-define double @test_rsqrt64_ftz(double %a) #0 #1 { +define double @test_rsqrt64_ftz(double %a) #1 { ; CHECK-LABEL: test_rsqrt64_ftz( ; CHECK: { ; CHECK-NEXT: .reg .b64 %rd<3>; @@ -70,7 +70,7 @@ define double @test_rsqrt64_ftz(double %a) #0 #1 { ; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; ; CHECK-NEXT: ret; %val = tail call double @llvm.sqrt.f64(double %a) - %ret = fdiv double 1.0, %val + %ret = fdiv arcp double 1.0, %val ret double %ret } @@ -229,7 +229,7 @@ define float @test_rsqrt32_refined(float %a) #0 #2 { ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; %val = tail call float @llvm.sqrt.f32(float %a) - %ret = fdiv float 1.0, %val + %ret = fdiv arcp float 1.0, %val ret float %ret } @@ -284,7 +284,7 @@ define double @test_rsqrt64_refined(double %a) #0 #2 { ; CHECK-NEXT: st.param.b64 [func_retval0], %rd6; ; CHECK-NEXT: ret; %val = tail call double @llvm.sqrt.f64(double %a) - %ret = fdiv double 1.0, %val + %ret = fdiv arcp double 1.0, %val ret double %ret } @@ -341,7 +341,7 @@ define float @test_rsqrt32_refined_ftz(float %a) #0 #1 #2 { ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; %val = tail call float @llvm.sqrt.f32(float %a) - %ret = fdiv float 1.0, %val + %ret = fdiv arcp float 1.0, %val ret float %ret } @@ -396,7 +396,7 @@ define double @test_rsqrt64_refined_ftz(double %a) #0 #1 #2 { ; CHECK-NEXT: st.param.b64 [func_retval0], %rd6; ; CHECK-NEXT: ret; %val = tail call double @llvm.sqrt.f64(double %a) - %ret = fdiv double 1.0, %val + %ret = fdiv arcp double 1.0, %val ret double %ret } diff --git a/llvm/test/CodeGen/X86/change-unsafe-fp-math.ll b/llvm/test/CodeGen/X86/change-unsafe-fp-math.ll index ba09ba8b6402b..2aa79fafe59a5 100644 --- a/llvm/test/CodeGen/X86/change-unsafe-fp-math.ll +++ b/llvm/test/CodeGen/X86/change-unsafe-fp-math.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=SAFE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-enable-unsafe-fp-math \ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=UNSAFE ; The div in these functions should be converted to a mul when unsafe-fp-math @@ -12,14 +12,19 @@ ; CHECK-LABEL: unsafe_fp_math_default0: define double @unsafe_fp_math_default0(double %x) { -; SAFE: divsd ; UNSAFE: mulsd + %div = fdiv arcp double %x, 3.0 + ret double %div +} +; CHECK-LABEL: safe_fp_math_default0: +define double @safe_fp_math_default0(double %x) { +; SAFE: divsd %div = fdiv double %x, 3.0 ret double %div } ; CHECK-LABEL: unsafe_fp_math_off: -define double @unsafe_fp_math_off(double %x) #0 { +define double @unsafe_fp_math_off(double %x) { ; SAFE: divsd ; UNSAFE: divsd %div = fdiv double %x, 3.0 @@ -29,28 +34,37 @@ define double @unsafe_fp_math_off(double %x) #0 { ; CHECK-LABEL: unsafe_fp_math_default1: define double @unsafe_fp_math_default1(double %x) { ; With unsafe math enabled, can change this div to a mul. -; SAFE: divsd ; UNSAFE: mulsd + %div = fdiv arcp double %x, 3.0 + ret double %div +} +; CHECK-LABEL: safe_fp_math_default1: +define double @safe_fp_math_default1(double %x) { +; With unsafe math enabled, can change this div to a mul. +; SAFE: divsd %div = fdiv double %x, 3.0 ret double %div } ; CHECK-LABEL: unsafe_fp_math_on: -define double @unsafe_fp_math_on(double %x) #1 { +define double @unsafe_fp_math_on(double %x) { ; SAFE: mulsd ; UNSAFE: mulsd - %div = fdiv double %x, 3.0 + %div = fdiv arcp double %x, 3.0 ret double %div } ; CHECK-LABEL: unsafe_fp_math_default2: define double @unsafe_fp_math_default2(double %x) { ; With unsafe math enabled, can change this div to a mul. -; SAFE: divsd ; UNSAFE: mulsd + %div = fdiv arcp double %x, 3.0 + ret double %div +} +; CHECK-LABEL: safe_fp_math_default2: +define double @safe_fp_math_default2(double %x) { +; With unsafe math enabled, can change this div to a mul. 
+; SAFE: divsd %div = fdiv double %x, 3.0 ret double %div } - -attributes #0 = { "unsafe-fp-math"="false" } -attributes #1 = { "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/fdiv.ll b/llvm/test/CodeGen/X86/fdiv.ll index 6d2db80a87bdf..67bad09d5dd7f 100644 --- a/llvm/test/CodeGen/X86/fdiv.ll +++ b/llvm/test/CodeGen/X86/fdiv.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s define double @exact(double %x) { ; Exact division by a constant converted to multiplication. @@ -17,7 +17,7 @@ define double @inexact(double %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %div = fdiv double %x, 0x41DFFFFFFFC00000 + %div = fdiv arcp double %x, 0x41DFFFFFFFC00000 ret double %div } From 79931a4d25e66141d721727d3e22a9601aa202eb Mon Sep 17 00:00:00 2001 From: PaperChalice Date: Thu, 26 Jun 2025 15:32:42 +0800 Subject: [PATCH 4/5] preserve fast-math flags when lowering fdiv --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 15 +++++---- llvm/test/CodeGen/NVPTX/frem.ll | 37 ++++++++++++++++++--- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index d2fafe854e9e4..9b43c6e326bf2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2857,15 +2857,16 @@ static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG, SDValue X = Op->getOperand(0); SDValue Y = Op->getOperand(1); EVT Ty = Op.getValueType(); + SDNodeFlags Flags = Op->getFlags(); - SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y); - SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div); - SDValue Mul = - DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y, SDNodeFlags::AllowContract); - SDValue Sub = - DAG.getNode(ISD::FSUB, 
DL, Ty, X, Mul, SDNodeFlags::AllowContract); + SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y, Flags); + SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div, Flags); + SDValue Mul = DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y, + Flags | SDNodeFlags::AllowContract); + SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul, + Flags | SDNodeFlags::AllowContract); - if (AllowUnsafeFPMath || Op->getFlags().hasNoInfs()) + if (AllowUnsafeFPMath || Flags.hasNoInfs()) return Sub; // If Y is infinite, return X diff --git a/llvm/test/CodeGen/NVPTX/frem.ll b/llvm/test/CodeGen/NVPTX/frem.ll index 909f2534f8219..5805aed1bebe6 100644 --- a/llvm/test/CodeGen/NVPTX/frem.ll +++ b/llvm/test/CodeGen/NVPTX/frem.ll @@ -222,25 +222,52 @@ define double @frem_f64_ninf(double %a, double %b) { ret double %r } -define float @frem_f32_imm1(float %a) { -; FAST-LABEL: frem_f32_imm1( +define float @frem_f32_imm1_fast(float %a) { +; FAST-LABEL: frem_f32_imm1_fast( ; FAST: { ; FAST-NEXT: .reg .b32 %r<5>; ; FAST-EMPTY: ; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.b32 %r1, [frem_f32_imm1_param_0]; +; FAST-NEXT: ld.param.b32 %r1, [frem_f32_imm1_fast_param_0]; ; FAST-NEXT: mul.f32 %r2, %r1, 0f3E124925; ; FAST-NEXT: cvt.rzi.f32.f32 %r3, %r2; ; FAST-NEXT: fma.rn.f32 %r4, %r3, 0fC0E00000, %r1; ; FAST-NEXT: st.param.b32 [func_retval0], %r4; ; FAST-NEXT: ret; ; -; NORMAL-LABEL: frem_f32_imm1( +; NORMAL-LABEL: frem_f32_imm1_fast( ; NORMAL: { ; NORMAL-NEXT: .reg .b32 %r<5>; ; NORMAL-EMPTY: ; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.b32 %r1, [frem_f32_imm1_param_0]; +; NORMAL-NEXT: ld.param.b32 %r1, [frem_f32_imm1_fast_param_0]; +; NORMAL-NEXT: mul.rn.f32 %r2, %r1, 0f3E124925; +; NORMAL-NEXT: cvt.rzi.f32.f32 %r3, %r2; +; NORMAL-NEXT: fma.rn.f32 %r4, %r3, 0fC0E00000, %r1; +; NORMAL-NEXT: st.param.b32 [func_retval0], %r4; +; NORMAL-NEXT: ret; + %r = frem arcp float %a, 7.0 + ret float %r +} +define float @frem_f32_imm1_normal(float %a) { +; FAST-LABEL: frem_f32_imm1_normal( +; FAST: { +; FAST-NEXT: .reg 
.b32 %r<5>; +; FAST-EMPTY: +; FAST-NEXT: // %bb.0: +; FAST-NEXT: ld.param.b32 %r1, [frem_f32_imm1_normal_param_0]; +; FAST-NEXT: div.approx.f32 %r2, %r1, 0f40E00000; +; FAST-NEXT: cvt.rzi.f32.f32 %r3, %r2; +; FAST-NEXT: fma.rn.f32 %r4, %r3, 0fC0E00000, %r1; +; FAST-NEXT: st.param.b32 [func_retval0], %r4; +; FAST-NEXT: ret; +; +; NORMAL-LABEL: frem_f32_imm1_normal( +; NORMAL: { +; NORMAL-NEXT: .reg .b32 %r<5>; +; NORMAL-EMPTY: +; NORMAL-NEXT: // %bb.0: +; NORMAL-NEXT: ld.param.b32 %r1, [frem_f32_imm1_normal_param_0]; ; NORMAL-NEXT: div.rn.f32 %r2, %r1, 0f40E00000; ; NORMAL-NEXT: cvt.rzi.f32.f32 %r3, %r2; ; NORMAL-NEXT: fma.rn.f32 %r4, %r3, 0fC0E00000, %r1; From 5f72ac3c2658be92379c1c0d5ad29e0471cc87b2 Mon Sep 17 00:00:00 2001 From: PaperChalice Date: Fri, 27 Jun 2025 16:25:52 +0800 Subject: [PATCH 5/5] Remove `isReassociable` by checking flags directly --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6c7b1499664b7..b7b65f2ea1b39 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17166,13 +17166,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { } } - auto isReassociable = [&Options](SDNode *N) { - return N->getFlags().hasAllowReassociation(); - }; - - auto isContractableAndReassociableFMUL = [&isContractableFMUL, - &isReassociable](SDValue N) { - return isContractableFMUL(N) && isReassociable(N.getNode()); + auto isContractableAndReassociableFMUL = [&isContractableFMUL](SDValue N) { + return isContractableFMUL(N) && N->getFlags().hasAllowReassociation(); }; auto isFusedOp = [&](SDValue N) { @@ -17180,7 +17175,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { }; // More folding opportunities when target permits. 
- if (Aggressive && isReassociable(N)) { + if (Aggressive && N->getFlags().hasAllowReassociation()) { bool CanFuse = N->getFlags().hasAllowContract(); // fold (fsub (fma x, y, (fmul u, v)), z) // -> (fma x, y (fma u, v, (fneg z)))