[RISCV] Fold (vXi8 (trunc (vselect (setltu, X, 256), X, (sext (setgt X, 0))))) to vmax+vnclipu. #94720

Merged 1 commit on Jun 7, 2024

75 changes: 73 additions & 2 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1452,7 +1452,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.hasStdExtZbb())
setTargetDAGCombine({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN});

if (Subtarget.hasStdExtZbs() && Subtarget.is64Bit())
if ((Subtarget.hasStdExtZbs() && Subtarget.is64Bit()) ||
Subtarget.hasStdExtV())
setTargetDAGCombine(ISD::TRUNCATE);

if (Subtarget.hasStdExtZbkb())
@@ -13370,6 +13371,76 @@ static SDValue combineDeMorganOfBoolean(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::XOR, DL, VT, Logic, DAG.getConstant(1, DL, VT));
}

// Fold (vXi8 (trunc (vselect (setltu, X, 256), X, (sext (setgt X, 0))))) to
Inline review comment (Collaborator):

An alternate, or possible generalization, would be to fold the trunc through the select as:
(vselect (setltu, X, 256), (trunc X), (sext (setgt X, 0))) -- the trunc on the false operand then folds into the sext, leaving a sext straight to the narrower type.

We could also see the source expressed that way, so we probably want to canonicalize one to the other at least.
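
For concreteness, a minimal IR sketch of that trunc-through-select form, using the same i16 -> i8 case as the tests below (function name is illustrative):

define <4 x i8> @select_then_trunc(<4 x i16> %x) {
  %gt0 = icmp sgt <4 x i16> %x, zeroinitializer
  %sat = sext <4 x i1> %gt0 to <4 x i8>            ; sext directly to the narrow type
  %small = icmp ult <4 x i16> %x, splat (i16 256)
  %t = trunc <4 x i16> %x to <4 x i8>
  %r = select <4 x i1> %small, <4 x i8> %t, <4 x i8> %sat
  ret <4 x i8> %r
}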

If we have (vselect (X > 0), 255, 0) - and variants - do we reliably combine that into the sext of condition form?

Reply (Collaborator, author):

> If we have (vselect (X > 0), 255, 0) - and variants - do we reliably combine that into the sext of condition form?

From a quick test, I think both InstCombine and DAGCombine will.
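
A hedged example of that variant at the IR level (i8 elements, so 255 is the all-ones value and the select is equivalent to a sext of the condition; function name is illustrative):

define <4 x i8> @select_255_0(<4 x i16> %x) {
  %gt0 = icmp sgt <4 x i16> %x, zeroinitializer
  ; select (c, -1, 0) is the sext-of-condition form in disguise: i8 -1 == 255
  %r = select <4 x i1> %gt0, <4 x i8> splat (i8 -1), <4 x i8> zeroinitializer
  ret <4 x i8> %r
}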

// (vXi8 (trunc (smin (smax X, 0), 255))). This represents saturating a signed
// value to an unsigned value. It will be lowered to vmax and a series of
// vnclipu instructions later. This can be extended to truncated types other
// than i8 by replacing 256 and 255 with the equivalent constants for the
// type.
static SDValue combineTruncSelectToSMaxUSat(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
EVT SrcVT = N0.getValueType();

const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!VT.isVector() || !TLI.isTypeLegal(VT) || !TLI.isTypeLegal(SrcVT))
return SDValue();

if (N0.getOpcode() != ISD::VSELECT || !N0.hasOneUse())
return SDValue();

SDValue Cond = N0.getOperand(0);
SDValue True = N0.getOperand(1);
SDValue False = N0.getOperand(2);

if (Cond.getOpcode() != ISD::SETCC)
return SDValue();

// FIXME: Support the version of this pattern with the select operands
// swapped.
ISD::CondCode CCVal = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
if (CCVal != ISD::SETULT)
return SDValue();

SDValue CondLHS = Cond.getOperand(0);
SDValue CondRHS = Cond.getOperand(1);

if (CondLHS != True)
return SDValue();

unsigned ScalarBits = VT.getScalarSizeInBits();

// FIXME: Support other constants.
ConstantSDNode *CondRHSC = isConstOrConstSplat(CondRHS);
if (!CondRHSC || CondRHSC->getAPIntValue() != (1ULL << ScalarBits))
return SDValue();

if (False.getOpcode() != ISD::SIGN_EXTEND)
return SDValue();

False = False.getOperand(0);

if (False.getOpcode() != ISD::SETCC || False.getOperand(0) != True)
return SDValue();

ConstantSDNode *FalseRHSC = isConstOrConstSplat(False.getOperand(1));
if (!FalseRHSC || !FalseRHSC->isZero())
return SDValue();

ISD::CondCode CCVal2 = cast<CondCodeSDNode>(False.getOperand(2))->get();
if (CCVal2 != ISD::SETGT)
return SDValue();

// Emit the signed to unsigned saturation pattern.
SDLoc DL(N);
SDValue Max =
DAG.getNode(ISD::SMAX, DL, SrcVT, True, DAG.getConstant(0, DL, SrcVT));
SDValue Min =
DAG.getNode(ISD::SMIN, DL, SrcVT, Max,
DAG.getConstant((1ULL << ScalarBits) - 1, DL, SrcVT));
return DAG.getNode(ISD::TRUNCATE, DL, VT, Min);
}
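
At the IR level, the smax/smin/trunc sequence this combine emits corresponds to the following (a hedged sketch of the i16 -> i8 case, mirroring the tests below; the generic llvm.smax/llvm.smin intrinsics stand in for the ISD::SMAX/ISD::SMIN nodes):

declare <4 x i16> @llvm.smax.v4i16(<4 x i16>, <4 x i16>)
declare <4 x i16> @llvm.smin.v4i16(<4 x i16>, <4 x i16>)

define <4 x i8> @smax_smin_trunc(<4 x i16> %x) {
  ; clamp below at 0, clamp above at 255, then truncate
  %max = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %x, <4 x i16> zeroinitializer)
  %min = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %max, <4 x i16> splat (i16 255))
  %res = trunc <4 x i16> %min to <4 x i8>
  ret <4 x i8> %res
}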

static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
@@ -13390,7 +13461,7 @@ static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Srl);
}

return SDValue();
return combineTruncSelectToSMaxUSat(N, DAG);
}

// Combines two comparison operation and logic operation to one selection
210 changes: 210 additions & 0 deletions llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll
@@ -0,0 +1,210 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s

define <4 x i8> @test_v4i16_v4i8(<4 x i16> %x) {
; CHECK-LABEL: test_v4i16_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vmax.vx v8, v8, zero
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT: vnclipu.wi v8, v8, 0
; CHECK-NEXT: ret
%a = icmp sgt <4 x i16> %x, zeroinitializer
%b = sext <4 x i1> %a to <4 x i16>
%c = icmp ult <4 x i16> %x, splat (i16 256)
%d = select <4 x i1> %c, <4 x i16> %x, <4 x i16> %b
%e = trunc <4 x i16> %d to <4 x i8>
ret <4 x i8> %e
}

define <4 x i8> @test_v4i32_v4i8(<4 x i32> %x) {
; CHECK-LABEL: test_v4i32_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmax.vx v8, v8, zero
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vnclipu.wi v8, v8, 0
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT: vnclipu.wi v8, v8, 0
; CHECK-NEXT: ret
%a = icmp sgt <4 x i32> %x, zeroinitializer
%b = sext <4 x i1> %a to <4 x i32>
%c = icmp ult <4 x i32> %x, splat (i32 256)
%d = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %b
%e = trunc <4 x i32> %d to <4 x i8>
ret <4 x i8> %e
}

define <4 x i8> @test_v4i64_v4i8(<4 x i64> %x) {
; CHECK-LABEL: test_v4i64_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: vmax.vx v8, v8, zero
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vnclipu.wi v10, v8, 0
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vnclipu.wi v8, v10, 0
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT: vnclipu.wi v8, v8, 0
; CHECK-NEXT: ret
%a = icmp sgt <4 x i64> %x, zeroinitializer
%b = sext <4 x i1> %a to <4 x i64>
%c = icmp ult <4 x i64> %x, splat (i64 256)
%d = select <4 x i1> %c, <4 x i64> %x, <4 x i64> %b
%e = trunc <4 x i64> %d to <4 x i8>
ret <4 x i8> %e
}

define <4 x i16> @test_v4i32_v4i16(<4 x i32> %x) {
; CHECK-LABEL: test_v4i32_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmax.vx v8, v8, zero
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vnclipu.wi v8, v8, 0
; CHECK-NEXT: ret
%a = icmp sgt <4 x i32> %x, zeroinitializer
%b = sext <4 x i1> %a to <4 x i32>
%c = icmp ult <4 x i32> %x, splat (i32 65536)
%d = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %b
%e = trunc <4 x i32> %d to <4 x i16>
ret <4 x i16> %e
}

define <4 x i16> @test_v4i64_v4i16(<4 x i64> %x) {
; CHECK-LABEL: test_v4i64_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: vmax.vx v8, v8, zero
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vnclipu.wi v10, v8, 0
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vnclipu.wi v8, v10, 0
; CHECK-NEXT: ret
%a = icmp sgt <4 x i64> %x, zeroinitializer
%b = sext <4 x i1> %a to <4 x i64>
%c = icmp ult <4 x i64> %x, splat (i64 65536)
%d = select <4 x i1> %c, <4 x i64> %x, <4 x i64> %b
%e = trunc <4 x i64> %d to <4 x i16>
ret <4 x i16> %e
}

define <4 x i32> @test_v4i64_v4i32(<4 x i64> %x) {
; CHECK-LABEL: test_v4i64_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: vmax.vx v10, v8, zero
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vnclipu.wi v8, v10, 0
; CHECK-NEXT: ret
%a = icmp sgt <4 x i64> %x, zeroinitializer
%b = sext <4 x i1> %a to <4 x i64>
%c = icmp ult <4 x i64> %x, splat (i64 4294967296)
%d = select <4 x i1> %c, <4 x i64> %x, <4 x i64> %b
%e = trunc <4 x i64> %d to <4 x i32>
ret <4 x i32> %e
}

define <vscale x 4 x i8> @test_nxv4i16_nxv4i8(<vscale x 4 x i16> %x) {
; CHECK-LABEL: test_nxv4i16_nxv4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT: vmax.vx v8, v8, zero
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
; CHECK-NEXT: vnclipu.wi v8, v8, 0
; CHECK-NEXT: ret
%a = icmp sgt <vscale x 4 x i16> %x, zeroinitializer
%b = sext <vscale x 4 x i1> %a to <vscale x 4 x i16>
%c = icmp ult <vscale x 4 x i16> %x, splat (i16 256)
%d = select <vscale x 4 x i1> %c, <vscale x 4 x i16> %x, <vscale x 4 x i16> %b
%e = trunc <vscale x 4 x i16> %d to <vscale x 4 x i8>
ret <vscale x 4 x i8> %e
}

define <vscale x 4 x i8> @test_nxv4i32_nxv4i8(<vscale x 4 x i32> %x) {
; CHECK-LABEL: test_nxv4i32_nxv4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vmax.vx v8, v8, zero
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vnclipu.wi v10, v8, 0
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
; CHECK-NEXT: vnclipu.wi v8, v10, 0
; CHECK-NEXT: ret
%a = icmp sgt <vscale x 4 x i32> %x, zeroinitializer
%b = sext <vscale x 4 x i1> %a to <vscale x 4 x i32>
%c = icmp ult <vscale x 4 x i32> %x, splat (i32 256)
%d = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %x, <vscale x 4 x i32> %b
%e = trunc <vscale x 4 x i32> %d to <vscale x 4 x i8>
ret <vscale x 4 x i8> %e
}

define <vscale x 4 x i8> @test_nxv4i64_nxv4i8(<vscale x 4 x i64> %x) {
; CHECK-LABEL: test_nxv4i64_nxv4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vmax.vx v8, v8, zero
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vnclipu.wi v12, v8, 0
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vnclipu.wi v8, v12, 0
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
; CHECK-NEXT: vnclipu.wi v8, v8, 0
; CHECK-NEXT: ret
%a = icmp sgt <vscale x 4 x i64> %x, zeroinitializer
%b = sext <vscale x 4 x i1> %a to <vscale x 4 x i64>
%c = icmp ult <vscale x 4 x i64> %x, splat (i64 256)
%d = select <vscale x 4 x i1> %c, <vscale x 4 x i64> %x, <vscale x 4 x i64> %b
%e = trunc <vscale x 4 x i64> %d to <vscale x 4 x i8>
ret <vscale x 4 x i8> %e
}

define <vscale x 4 x i16> @test_nxv4i32_nxv4i16(<vscale x 4 x i32> %x) {
; CHECK-LABEL: test_nxv4i32_nxv4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vmax.vx v10, v8, zero
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vnclipu.wi v8, v10, 0
; CHECK-NEXT: ret
%a = icmp sgt <vscale x 4 x i32> %x, zeroinitializer
%b = sext <vscale x 4 x i1> %a to <vscale x 4 x i32>
%c = icmp ult <vscale x 4 x i32> %x, splat (i32 65536)
%d = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %x, <vscale x 4 x i32> %b
%e = trunc <vscale x 4 x i32> %d to <vscale x 4 x i16>
ret <vscale x 4 x i16> %e
}

define <vscale x 4 x i16> @test_nxv4i64_nxv4i16(<vscale x 4 x i64> %x) {
; CHECK-LABEL: test_nxv4i64_nxv4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vmax.vx v8, v8, zero
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vnclipu.wi v12, v8, 0
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vnclipu.wi v8, v12, 0
; CHECK-NEXT: ret
%a = icmp sgt <vscale x 4 x i64> %x, zeroinitializer
%b = sext <vscale x 4 x i1> %a to <vscale x 4 x i64>
%c = icmp ult <vscale x 4 x i64> %x, splat (i64 65536)
%d = select <vscale x 4 x i1> %c, <vscale x 4 x i64> %x, <vscale x 4 x i64> %b
%e = trunc <vscale x 4 x i64> %d to <vscale x 4 x i16>
ret <vscale x 4 x i16> %e
}

define <vscale x 4 x i32> @test_nxv4i64_nxv4i32(<vscale x 4 x i64> %x) {
; CHECK-LABEL: test_nxv4i64_nxv4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vmax.vx v12, v8, zero
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vnclipu.wi v8, v12, 0
; CHECK-NEXT: ret
%a = icmp sgt <vscale x 4 x i64> %x, zeroinitializer
%b = sext <vscale x 4 x i1> %a to <vscale x 4 x i64>
%c = icmp ult <vscale x 4 x i64> %x, splat (i64 4294967296)
%d = select <vscale x 4 x i1> %c, <vscale x 4 x i64> %x, <vscale x 4 x i64> %b
%e = trunc <vscale x 4 x i64> %d to <vscale x 4 x i32>
ret <vscale x 4 x i32> %e
}