diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8ace5d79af079..a3007dc0296ae 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1452,7 +1452,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
   if (Subtarget.hasStdExtZbb())
     setTargetDAGCombine({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN});
 
-  if (Subtarget.hasStdExtZbs() && Subtarget.is64Bit())
+  if ((Subtarget.hasStdExtZbs() && Subtarget.is64Bit()) ||
+      Subtarget.hasStdExtV())
     setTargetDAGCombine(ISD::TRUNCATE);
 
   if (Subtarget.hasStdExtZbkb())
@@ -13370,6 +13371,76 @@ static SDValue combineDeMorganOfBoolean(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(ISD::XOR, DL, VT, Logic, DAG.getConstant(1, DL, VT));
 }
 
+// Fold (vXi8 (trunc (vselect (setltu, X, 256), X, (sext (setgt X, 0))))) to
+// (vXi8 (trunc (smin (smax X, 0), 255))). This represents saturating a signed
+// value to an unsigned value. This will be lowered to vmax and series of
+// vnclipu instructions later. This can be extended to other truncated types
+// other than i8 by replacing 256 and 255 with the equivalent constants for the
+// type.
+static SDValue combineTruncSelectToSMaxUSat(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  EVT SrcVT = N0.getValueType();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!VT.isVector() || !TLI.isTypeLegal(VT) || !TLI.isTypeLegal(SrcVT))
+    return SDValue();
+
+  if (N0.getOpcode() != ISD::VSELECT || !N0.hasOneUse())
+    return SDValue();
+
+  SDValue Cond = N0.getOperand(0);
+  SDValue True = N0.getOperand(1);
+  SDValue False = N0.getOperand(2);
+
+  if (Cond.getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  // FIXME: Support the version of this pattern with the select operands
+  // swapped.
+  ISD::CondCode CCVal = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+  if (CCVal != ISD::SETULT)
+    return SDValue();
+
+  SDValue CondLHS = Cond.getOperand(0);
+  SDValue CondRHS = Cond.getOperand(1);
+
+  if (CondLHS != True)
+    return SDValue();
+
+  unsigned ScalarBits = VT.getScalarSizeInBits();
+
+  // FIXME: Support other constants.
+  ConstantSDNode *CondRHSC = isConstOrConstSplat(CondRHS);
+  if (!CondRHSC || CondRHSC->getAPIntValue() != (1ULL << ScalarBits))
+    return SDValue();
+
+  if (False.getOpcode() != ISD::SIGN_EXTEND)
+    return SDValue();
+
+  False = False.getOperand(0);
+
+  if (False.getOpcode() != ISD::SETCC || False.getOperand(0) != True)
+    return SDValue();
+
+  ConstantSDNode *FalseRHSC = isConstOrConstSplat(False.getOperand(1));
+  if (!FalseRHSC || !FalseRHSC->isZero())
+    return SDValue();
+
+  ISD::CondCode CCVal2 = cast<CondCodeSDNode>(False.getOperand(2))->get();
+  if (CCVal2 != ISD::SETGT)
+    return SDValue();
+
+  // Emit the signed to unsigned saturation pattern.
+  SDLoc DL(N);
+  SDValue Max =
+      DAG.getNode(ISD::SMAX, DL, SrcVT, True, DAG.getConstant(0, DL, SrcVT));
+  SDValue Min =
+      DAG.getNode(ISD::SMIN, DL, SrcVT, Max,
+                  DAG.getConstant((1ULL << ScalarBits) - 1, DL, SrcVT));
+  return DAG.getNode(ISD::TRUNCATE, DL, VT, Min);
+}
+
 static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
                                       const RISCVSubtarget &Subtarget) {
   SDValue N0 = N->getOperand(0);
@@ -13390,7 +13461,7 @@ static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
     return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Srl);
   }
 
-  return SDValue();
+  return combineTruncSelectToSMaxUSat(N, DAG);
 }
 
 // Combines two comparison operation and logic operation to one selection
diff --git a/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll b/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll
new file mode 100644
index 0000000000000..28d7588b9347a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
+
+define <4 x i8> @test_v4i16_v4i8(<4 x i16> %x) {
+; CHECK-LABEL: test_v4i16_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    ret
+  %a = icmp sgt <4 x i16> %x, zeroinitializer
+  %b = sext <4 x i1> %a to <4 x i16>
+  %c = icmp ult <4 x i16> %x, splat (i16 256)
+  %d = select <4 x i1> %c, <4 x i16> %x, <4 x i16> %b
+  %e = trunc <4 x i16> %d to <4 x i8>
+  ret <4 x i8> %e
+}
+
+define <4 x i8> @test_v4i32_v4i8(<4 x i32> %x) {
+; CHECK-LABEL: test_v4i32_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    ret
+  %a = icmp sgt <4 x i32> %x, zeroinitializer
+  %b = sext <4 x i1> %a to <4 x i32>
+  %c = icmp ult <4 x i32> %x, splat (i32 256)
+  %d = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %b
+  %e = trunc <4 x i32> %d to <4 x i8>
+  ret <4 x i8> %e
+}
+
+define <4 x i8> @test_v4i64_v4i8(<4 x i64> %x) {
+; CHECK-LABEL: test_v4i64_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnclipu.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v10, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    ret
+  %a = icmp sgt <4 x i64> %x, zeroinitializer
+  %b = sext <4 x i1> %a to <4 x i64>
+  %c = icmp ult <4 x i64> %x, splat (i64 256)
+  %d = select <4 x i1> %c, <4 x i64> %x, <4 x i64> %b
+  %e = trunc <4 x i64> %d to <4 x i8>
+  ret <4 x i8> %e
+}
+
+define <4 x i16> @test_v4i32_v4i16(<4 x i32> %x) {
+; CHECK-LABEL: test_v4i32_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    ret
+  %a = icmp sgt <4 x i32> %x, zeroinitializer
+  %b = sext <4 x i1> %a to <4 x i32>
+  %c = icmp ult <4 x i32> %x, splat (i32 65536)
+  %d = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %b
+  %e = trunc <4 x i32> %d to <4 x i16>
+  ret <4 x i16> %e
+}
+
+define <4 x i16> @test_v4i64_v4i16(<4 x i64> %x) {
+; CHECK-LABEL: test_v4i64_v4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnclipu.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v10, 0
+; CHECK-NEXT:    ret
+  %a = icmp sgt <4 x i64> %x, zeroinitializer
+  %b = sext <4 x i1> %a to <4 x i64>
+  %c = icmp ult <4 x i64> %x, splat (i64 65536)
+  %d = select <4 x i1> %c, <4 x i64> %x, <4 x i64> %b
+  %e = trunc <4 x i64> %d to <4 x i16>
+  ret <4 x i16> %e
+}
+
+define <4 x i32> @test_v4i64_v4i32(<4 x i64> %x) {
+; CHECK-LABEL: test_v4i64_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vmax.vx v10, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v10, 0
+; CHECK-NEXT:    ret
+  %a = icmp sgt <4 x i64> %x, zeroinitializer
+  %b = sext <4 x i1> %a to <4 x i64>
+  %c = icmp ult <4 x i64> %x, splat (i64 4294967296)
+  %d = select <4 x i1> %c, <4 x i64> %x, <4 x i64> %b
+  %e = trunc <4 x i64> %d to <4 x i32>
+  ret <4 x i32> %e
+}
+
+define <vscale x 4 x i8> @test_nxv4i16_nxv4i8(<vscale x 4 x i16> %x) {
+; CHECK-LABEL: test_nxv4i16_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    ret
+  %a = icmp sgt <vscale x 4 x i16> %x, zeroinitializer
+  %b = sext <vscale x 4 x i1> %a to <vscale x 4 x i16>
+  %c = icmp ult <vscale x 4 x i16> %x, splat (i16 256)
+  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i16> %x, <vscale x 4 x i16> %b
+  %e = trunc <vscale x 4 x i16> %d to <vscale x 4 x i8>
+  ret <vscale x 4 x i8> %e
+}
+
+define <vscale x 4 x i8> @test_nxv4i32_nxv4i8(<vscale x 4 x i32> %x) {
+; CHECK-LABEL: test_nxv4i32_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnclipu.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v10, 0
+; CHECK-NEXT:    ret
+  %a = icmp sgt <vscale x 4 x i32> %x, zeroinitializer
+  %b = sext <vscale x 4 x i1> %a to <vscale x 4 x i32>
+  %c = icmp ult <vscale x 4 x i32> %x, splat (i32 256)
+  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %x, <vscale x 4 x i32> %b
+  %e = trunc <vscale x 4 x i32> %d to <vscale x 4 x i8>
+  ret <vscale x 4 x i8> %e
+}
+
+define <vscale x 4 x i8> @test_nxv4i64_nxv4i8(<vscale x 4 x i64> %x) {
+; CHECK-LABEL: test_nxv4i64_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnclipu.wi v12, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v12, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    ret
+  %a = icmp sgt <vscale x 4 x i64> %x, zeroinitializer
+  %b = sext <vscale x 4 x i1> %a to <vscale x 4 x i64>
+  %c = icmp ult <vscale x 4 x i64> %x, splat (i64 256)
+  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i64> %x, <vscale x 4 x i64> %b
+  %e = trunc <vscale x 4 x i64> %d to <vscale x 4 x i8>
+  ret <vscale x 4 x i8> %e
+}
+
+define <vscale x 4 x i16> @test_nxv4i32_nxv4i16(<vscale x 4 x i32> %x) {
+; CHECK-LABEL: test_nxv4i32_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmax.vx v10, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v10, 0
+; CHECK-NEXT:    ret
+  %a = icmp sgt <vscale x 4 x i32> %x, zeroinitializer
+  %b = sext <vscale x 4 x i1> %a to <vscale x 4 x i32>
+  %c = icmp ult <vscale x 4 x i32> %x, splat (i32 65536)
+  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %x, <vscale x 4 x i32> %b
+  %e = trunc <vscale x 4 x i32> %d to <vscale x 4 x i16>
+  ret <vscale x 4 x i16> %e
+}
+
+define <vscale x 4 x i16> @test_nxv4i64_nxv4i16(<vscale x 4 x i64> %x) {
+; CHECK-LABEL: test_nxv4i64_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnclipu.wi v12, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v12, 0
+; CHECK-NEXT:    ret
+  %a = icmp sgt <vscale x 4 x i64> %x, zeroinitializer
+  %b = sext <vscale x 4 x i1> %a to <vscale x 4 x i64>
+  %c = icmp ult <vscale x 4 x i64> %x, splat (i64 65536)
+  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i64> %x, <vscale x 4 x i64> %b
+  %e = trunc <vscale x 4 x i64> %d to <vscale x 4 x i16>
+  ret <vscale x 4 x i16> %e
+}
+
+define <vscale x 4 x i32> @test_nxv4i64_nxv4i32(<vscale x 4 x i64> %x) {
+; CHECK-LABEL: test_nxv4i64_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vmax.vx v12, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnclipu.wi v8, v12, 0
+; CHECK-NEXT:    ret
+  %a = icmp sgt <vscale x 4 x i64> %x, zeroinitializer
+  %b = sext <vscale x 4 x i1> %a to <vscale x 4 x i64>
+  %c = icmp ult <vscale x 4 x i64> %x, splat (i64 4294967296)
+  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i64> %x, <vscale x 4 x i64> %b
+  %e = trunc <vscale x 4 x i64> %d to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %e
+}