[AArch64][GISel] Fix lowering of fp16 intrinsics (#130156)

sdesmalen-arm · web-flow · commit 9af538420aca · 2025-03-14T11:06:10.000Z
This addresses the issue described in #128843.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -466,6 +466,16 @@ static bool isFPIntrinsic(const MachineRegisterInfo &MRI,
   case Intrinsic::aarch64_neon_fminv:
   case Intrinsic::aarch64_neon_fmaxnmv:
   case Intrinsic::aarch64_neon_fminnmv:
+  case Intrinsic::aarch64_neon_fmulx:
+  case Intrinsic::aarch64_neon_frecpe:
+  case Intrinsic::aarch64_neon_frecps:
+  case Intrinsic::aarch64_neon_frecpx:
+  case Intrinsic::aarch64_neon_frsqrte:
+  case Intrinsic::aarch64_neon_frsqrts:
+  case Intrinsic::aarch64_neon_facge:
+  case Intrinsic::aarch64_neon_facgt:
+  case Intrinsic::aarch64_neon_fabd:
+  case Intrinsic::aarch64_sisd_fabd:
     return true;
   case Intrinsic::aarch64_neon_saddlv: {
     const LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
@@ -540,6 +550,24 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI,
   case TargetOpcode::G_LROUND:
   case TargetOpcode::G_LLROUND:
     return true;
+  case TargetOpcode::G_INTRINSIC:
+    switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
+    case Intrinsic::aarch64_neon_fcvtas:
+    case Intrinsic::aarch64_neon_fcvtau:
+    case Intrinsic::aarch64_neon_fcvtzs:
+    case Intrinsic::aarch64_neon_fcvtzu:
+    case Intrinsic::aarch64_neon_fcvtms:
+    case Intrinsic::aarch64_neon_fcvtmu:
+    case Intrinsic::aarch64_neon_fcvtns:
+    case Intrinsic::aarch64_neon_fcvtnu:
+    case Intrinsic::aarch64_neon_fcvtps:
+    case Intrinsic::aarch64_neon_fcvtpu:
+      // Force FPR register bank for half types, as those types otherwise
+      // don't get legalized correctly, resulting in fp16 <-> gpr32 COPYs.
+      return MRI.getType(MI.getOperand(2).getReg()) == LLT::float16();
+    default:
+      break;
+    }
   default:
     break;
   }
@@ -1082,24 +1110,41 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     break;
   case TargetOpcode::G_INTRINSIC:
   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: {
-    // Check if we know that the intrinsic has any constraints on its register
-    // banks. If it does, then update the mapping accordingly.
-    unsigned Idx = 0;
-    if (onlyDefinesFP(MI, MRI, TRI))
-      for (const auto &Op : MI.defs()) {
-        if (Op.isReg())
-          OpRegBankIdx[Idx] = PMI_FirstFPR;
-        ++Idx;
-      }
-    else
-      Idx += MI.getNumExplicitDefs();
-
-    if (onlyUsesFP(MI, MRI, TRI))
-      for (const auto &Op : MI.explicit_uses()) {
-        if (Op.isReg())
-          OpRegBankIdx[Idx] = PMI_FirstFPR;
-        ++Idx;
-      }
+    switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
+    case Intrinsic::aarch64_neon_vcvtfxs2fp:
+    case Intrinsic::aarch64_neon_vcvtfxu2fp:
+    case Intrinsic::aarch64_neon_vcvtfp2fxs:
+    case Intrinsic::aarch64_neon_vcvtfp2fxu:
+      // Override these intrinsics, because they would have a partial
+      // mapping. This is needed for 'half' types, which otherwise don't
+      // get legalised correctly.
+      OpRegBankIdx[0] = PMI_FirstFPR;
+      OpRegBankIdx[2] = PMI_FirstFPR;
+      // OpRegBankIdx[1] is the intrinsic ID.
+      // OpRegBankIdx[3] is an integer immediate.
+      break;
+    default: {
+      // Check if we know that the intrinsic has any constraints on its register
+      // banks. If it does, then update the mapping accordingly.
+      unsigned Idx = 0;
+      if (onlyDefinesFP(MI, MRI, TRI))
+        for (const auto &Op : MI.defs()) {
+          if (Op.isReg())
+            OpRegBankIdx[Idx] = PMI_FirstFPR;
+          ++Idx;
+        }
+      else
+        Idx += MI.getNumExplicitDefs();
+
+      if (onlyUsesFP(MI, MRI, TRI))
+        for (const auto &Op : MI.explicit_uses()) {
+          if (Op.isReg())
+            OpRegBankIdx[Idx] = PMI_FirstFPR;
+          ++Idx;
+        }
+      break;
+    }
+    }
     break;
   }
   case TargetOpcode::G_LROUND:
diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -2,10 +2,6 @@
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck -check-prefixes=CHECK,CHECK-SD %s
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; CHECK-GI:  warning: Instruction selection used fallback path for fabds
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fabdd
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uabd_i64
-
 define <8 x i16> @sabdl8h(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: sabdl8h:
 ; CHECK:       // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64 -mattr=+v8.2a,+fullfp16  | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -global-isel=0 -mattr=+v8.2a,+fullfp16  | FileCheck %s --check-prefixes=CHECK,SDISEL
+; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -mattr=+v8.2a,+fullfp16  | FileCheck %s --check-prefixes=CHECK,GISEL
 
 declare i64 @llvm.aarch64.neon.fcvtpu.i64.f16(half)
 declare i32 @llvm.aarch64.neon.fcvtpu.i32.f16(half)
@@ -26,59 +27,94 @@ declare half @llvm.aarch64.neon.frecpx.f16(half)
 declare half @llvm.aarch64.neon.frecpe.f16(half)
 
 define dso_local i16 @t2(half %a) {
-; CHECK-LABEL: t2:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcmp h0, #0.0
-; CHECK-NEXT:    csetm w0, eq
-; CHECK-NEXT:    ret
+; SDISEL-LABEL: t2:
+; SDISEL:       // %bb.0: // %entry
+; SDISEL-NEXT:    fcmp h0, #0.0
+; SDISEL-NEXT:    csetm w0, eq
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: t2:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    fcmp h0, #0.0
+; GISEL-NEXT:    cset w8, eq
+; GISEL-NEXT:    sbfx w0, w8, #0, #1
+; GISEL-NEXT:    ret
 entry:
   %0 = fcmp oeq half %a, 0xH0000
   %vceqz = sext i1 %0 to i16
   ret i16 %vceqz
 }
 
 define dso_local i16 @t3(half %a) {
-; CHECK-LABEL: t3:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcmp h0, #0.0
-; CHECK-NEXT:    csetm w0, ge
-; CHECK-NEXT:    ret
+; SDISEL-LABEL: t3:
+; SDISEL:       // %bb.0: // %entry
+; SDISEL-NEXT:    fcmp h0, #0.0
+; SDISEL-NEXT:    csetm w0, ge
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: t3:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    fcmp h0, #0.0
+; GISEL-NEXT:    cset w8, ge
+; GISEL-NEXT:    sbfx w0, w8, #0, #1
+; GISEL-NEXT:    ret
 entry:
   %0 = fcmp oge half %a, 0xH0000
   %vcgez = sext i1 %0 to i16
   ret i16 %vcgez
 }
 
 define dso_local i16 @t4(half %a) {
-; CHECK-LABEL: t4:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcmp h0, #0.0
-; CHECK-NEXT:    csetm w0, gt
-; CHECK-NEXT:    ret
+; SDISEL-LABEL: t4:
+; SDISEL:       // %bb.0: // %entry
+; SDISEL-NEXT:    fcmp h0, #0.0
+; SDISEL-NEXT:    csetm w0, gt
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: t4:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    fcmp h0, #0.0
+; GISEL-NEXT:    cset w8, gt
+; GISEL-NEXT:    sbfx w0, w8, #0, #1
+; GISEL-NEXT:    ret
 entry:
   %0 = fcmp ogt half %a, 0xH0000
   %vcgtz = sext i1 %0 to i16
   ret i16 %vcgtz
 }
 
 define dso_local i16 @t5(half %a) {
-; CHECK-LABEL: t5:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcmp h0, #0.0
-; CHECK-NEXT:    csetm w0, ls
-; CHECK-NEXT:    ret
+; SDISEL-LABEL: t5:
+; SDISEL:       // %bb.0: // %entry
+; SDISEL-NEXT:    fcmp h0, #0.0
+; SDISEL-NEXT:    csetm w0, ls
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: t5:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    fcmp h0, #0.0
+; GISEL-NEXT:    cset w8, ls
+; GISEL-NEXT:    sbfx w0, w8, #0, #1
+; GISEL-NEXT:    ret
 entry:
   %0 = fcmp ole half %a, 0xH0000
   %vclez = sext i1 %0 to i16
   ret i16 %vclez
 }
 
 define dso_local i16 @t6(half %a) {
-; CHECK-LABEL: t6:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcmp h0, #0.0
-; CHECK-NEXT:    csetm w0, mi
-; CHECK-NEXT:    ret
+; SDISEL-LABEL: t6:
+; SDISEL:       // %bb.0: // %entry
+; SDISEL-NEXT:    fcmp h0, #0.0
+; SDISEL-NEXT:    csetm w0, mi
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: t6:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    fcmp h0, #0.0
+; GISEL-NEXT:    cset w8, mi
+; GISEL-NEXT:    sbfx w0, w8, #0, #1
+; GISEL-NEXT:    ret
 entry:
   %0 = fcmp olt half %a, 0xH0000
   %vcltz = sext i1 %0 to i16
@@ -136,10 +172,15 @@ entry:
 }
 
 define dso_local i16 @t16(half %a) {
-; CHECK-LABEL: t16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtzs w0, h0
-; CHECK-NEXT:    ret
+; SDISEL-LABEL: t16:
+; SDISEL:       // %bb.0: // %entry
+; SDISEL-NEXT:    fcvtzs w0, h0
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: t16:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    fcvtzu w0, h0
+; GISEL-NEXT:    ret
 entry:
   %0 = fptoui half %a to i16
   ret i16 %0
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll