From 643ca92fb1a8957d38f78464571f39287b9e6b5f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 23 Jun 2025 14:03:59 +0100
Subject: [PATCH] [X86][VBMI2] Try to lower shuffle as VSHLDI instructions

Fixes #145276
---
 llvm/lib/Target/X86/X86ISelLowering.cpp    |  85 ++++++++++++++++
 llvm/test/CodeGen/X86/shuffle-as-shifts.ll | 112 +++++++++++++++++----
 2 files changed, 177 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2541182de1208..37c553b1343ce 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12096,6 +12096,46 @@ static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
   return DAG.getBitcast(VT, Res);
 }
 
+static SDValue lowerShuffleAsVSHLD(const SDLoc &DL, MVT VT, SDValue V1,
+                                   SDValue V2, ArrayRef<int> Mask,
+                                   const X86Subtarget &Subtarget,
+                                   SelectionDAG &DAG) {
+  if (!Subtarget.hasVBMI2())
+    return SDValue();
+  if (!Subtarget.hasVLX() && !VT.is512BitVector())
+    return SDValue();
+
+  unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
+  for (int Scale = 2; Scale * ScalarSizeInBits <= 64; Scale *= 2) {
+    unsigned LaneSize = Scale * ScalarSizeInBits;
+    SmallVector<int, 8> RepeatedMask;
+    if (isRepeatedShuffleMask(LaneSize, VT, Mask, RepeatedMask)) {
+      for (int Shift = 1; Shift != Scale; ++Shift) {
+        MVT ShiftVT = MVT::getVectorVT(MVT::getIntegerVT(LaneSize),
+                                       VT.getSizeInBits() / LaneSize);
+        unsigned Offset = Scale - Shift;
+        if (isSequentialOrUndefInRange(RepeatedMask, 0, Shift,
+                                       Scale + Offset) &&
+            isSequentialOrUndefInRange(RepeatedMask, Shift, Offset, 0)) {
+          return DAG.getBitcast(
+              VT, DAG.getNode(X86ISD::VSHLD, DL, ShiftVT, V1, V2,
+                              DAG.getTargetConstant(Shift * ScalarSizeInBits,
+                                                    DL, MVT::i8)));
+        }
+        if (isSequentialOrUndefInRange(RepeatedMask, 0, Shift, Offset) &&
+            isSequentialOrUndefInRange(RepeatedMask, Shift, Offset, 0)) {
+          return DAG.getBitcast(
+              VT, DAG.getNode(X86ISD::VSHLD, DL, ShiftVT, V2, V1,
+                              DAG.getTargetConstant(Shift * ScalarSizeInBits,
+                                                    DL, MVT::i8)));
+        }
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
 ///
 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
@@ -13789,6 +13829,11 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       return Rotate;
   }
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   // Assume that a single SHUFPS is faster than an alternative sequence of
   // multiple instructions (even if the CPU has a domain penalty).
   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
@@ -14507,6 +14552,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                 Subtarget, DAG))
     return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   if (SDValue BitBlend =
           lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
     return BitBlend;
@@ -14702,6 +14752,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                 Subtarget, DAG))
     return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   // Use dedicated pack instructions for masks that match their pattern.
   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask,
                                        Subtarget, DAG))
@@ -16861,6 +16916,11 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                   Subtarget, DAG))
       return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   // Try to create an in-lane repeating shuffle mask and then shuffle the
   // results into the target lanes.
   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -16955,6 +17015,11 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                  Subtarget, DAG))
     return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   // Try to create an in-lane repeating shuffle mask and then shuffle the
   // results into the target lanes.
   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -17078,6 +17143,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                 Subtarget, DAG))
     return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   // Try to use bit rotation instructions.
   if (V2.isUndef())
     if (SDValue Rotate =
@@ -17590,6 +17660,11 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                 Subtarget, DAG))
     return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   // Assume that a single SHUFPS is faster than using a permv shuffle.
   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
@@ -17655,6 +17730,11 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                 Subtarget, DAG))
     return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   if (V2.isUndef()) {
     // Try to use bit rotation instructions.
     if (SDValue Rotate =
@@ -17726,6 +17806,11 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                 Subtarget, DAG))
     return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   // Try to use bit rotation instructions.
   if (V2.isUndef())
     if (SDValue Rotate =
diff --git a/llvm/test/CodeGen/X86/shuffle-as-shifts.ll b/llvm/test/CodeGen/X86/shuffle-as-shifts.ll
index 9c8729b3ea505..78309712a9b4e 100644
--- a/llvm/test/CodeGen/X86/shuffle-as-shifts.ll
+++ b/llvm/test/CodeGen/X86/shuffle-as-shifts.ll
@@ -105,22 +105,58 @@ define <8 x i16> @shuf_rot_v8i16_10325476(<8 x i16> %x) {
 }
 
 define <16 x i16> @shuf_rot_v16i16_1032547698111013121514(<16 x i16> %x) {
-; CHECK-LABEL: shuf_rot_v16i16_1032547698111013121514:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
-; CHECK-NEXT:    vprold $16, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: shuf_rot_v16i16_1032547698111013121514:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
+; CHECK-SKX-NEXT:    vprold $16, %ymm0, %ymm0
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-LABEL: shuf_rot_v16i16_1032547698111013121514:
+; CHECK-ICX:       # %bb.0:
+; CHECK-ICX-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
+; CHECK-ICX-NEXT:    vpshldd $16, %ymm0, %ymm0, %ymm0
+; CHECK-ICX-NEXT:    retq
+;
+; CHECK-V4-LABEL: shuf_rot_v16i16_1032547698111013121514:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
+; CHECK-V4-NEXT:    vprold $16, %ymm0, %ymm0
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: shuf_rot_v16i16_1032547698111013121514:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
+; CHECK-ZNVER4-NEXT:    vpshldd $16, %ymm0, %ymm0, %ymm0
+; CHECK-ZNVER4-NEXT:    retq
   %x1 = add <16 x i16> %x, %x
   %r = shufflevector <16 x i16> %x1, <16 x i16> zeroinitializer, <16 x i32>
   ret <16 x i16> %r
 }
 
 define <32 x i16> @shuf_rot_v32i16_1234056749101181314151217181916212223202527272429303128(<32 x i16> %x) {
-; CHECK-LABEL: shuf_rot_v32i16_1234056749101181314151217181916212223202527272429303128:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpaddw %zmm0, %zmm0, %zmm0
-; CHECK-NEXT:    vprolq $48, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: shuf_rot_v32i16_1234056749101181314151217181916212223202527272429303128:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    vpaddw %zmm0, %zmm0, %zmm0
+; CHECK-SKX-NEXT:    vprolq $48, %zmm0, %zmm0
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-LABEL: shuf_rot_v32i16_1234056749101181314151217181916212223202527272429303128:
+; CHECK-ICX:       # %bb.0:
+; CHECK-ICX-NEXT:    vpaddw %zmm0, %zmm0, %zmm0
+; CHECK-ICX-NEXT:    vpshldq $48, %zmm0, %zmm0, %zmm0
+; CHECK-ICX-NEXT:    retq
+;
+; CHECK-V4-LABEL: shuf_rot_v32i16_1234056749101181314151217181916212223202527272429303128:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    vpaddw %zmm0, %zmm0, %zmm0
+; CHECK-V4-NEXT:    vprolq $48, %zmm0, %zmm0
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: shuf_rot_v32i16_1234056749101181314151217181916212223202527272429303128:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    vpaddw %zmm0, %zmm0, %zmm0
+; CHECK-ZNVER4-NEXT:    vpshldq $48, %zmm0, %zmm0, %zmm0
+; CHECK-ZNVER4-NEXT:    retq
   %x1 = add <32 x i16> %x, %x
   %r = shufflevector <32 x i16> %x1, <32 x i16> zeroinitializer, <32 x i32>
   ret <32 x i16> %r
@@ -138,22 +174,58 @@ define <16 x i8> @shuf_rot_v16i8_2301674510118914151213(<16 x i8> %x) {
 }
 
 define <32 x i8> @shuf_rot_v32i8_230167451011891415121318191617222320212627242530312829(<32 x i8> %x) {
-; CHECK-LABEL: shuf_rot_v32i8_230167451011891415121318191617222320212627242530312829:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
-; CHECK-NEXT:    vprold $16, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: shuf_rot_v32i8_230167451011891415121318191617222320212627242530312829:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
+; CHECK-SKX-NEXT:    vprold $16, %ymm0, %ymm0
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-LABEL: shuf_rot_v32i8_230167451011891415121318191617222320212627242530312829:
+; CHECK-ICX:       # %bb.0:
+; CHECK-ICX-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
+; CHECK-ICX-NEXT:    vpshldd $16, %ymm0, %ymm0, %ymm0
+; CHECK-ICX-NEXT:    retq
+;
+; CHECK-V4-LABEL: shuf_rot_v32i8_230167451011891415121318191617222320212627242530312829:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
+; CHECK-V4-NEXT:    vprold $16, %ymm0, %ymm0
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: shuf_rot_v32i8_230167451011891415121318191617222320212627242530312829:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
+; CHECK-ZNVER4-NEXT:    vpshldd $16, %ymm0, %ymm0, %ymm0
+; CHECK-ZNVER4-NEXT:    retq
   %x1 = add <32 x i8> %x, %x
   %r = shufflevector <32 x i8> %x1, <32 x i8> zeroinitializer, <32 x i32>
   ret <32 x i8> %r
 }
 
 define <64 x i8> @shuf_rot_v64i8_3012745611891015121314191617182320212227242526312829303532333439363738434041424744454651484950555253545956575863606162(<64 x i8> %x) {
-; CHECK-LABEL: shuf_rot_v64i8_3012745611891015121314191617182320212227242526312829303532333439363738434041424744454651484950555253545956575863606162:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpaddb %zmm0, %zmm0, %zmm0
-; CHECK-NEXT:    vprold $8, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: shuf_rot_v64i8_3012745611891015121314191617182320212227242526312829303532333439363738434041424744454651484950555253545956575863606162:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    vpaddb %zmm0, %zmm0, %zmm0
+; CHECK-SKX-NEXT:    vprold $8, %zmm0, %zmm0
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-ICX-LABEL: shuf_rot_v64i8_3012745611891015121314191617182320212227242526312829303532333439363738434041424744454651484950555253545956575863606162:
+; CHECK-ICX:       # %bb.0:
+; CHECK-ICX-NEXT:    vpaddb %zmm0, %zmm0, %zmm0
+; CHECK-ICX-NEXT:    vpshldd $8, %zmm0, %zmm0, %zmm0
+; CHECK-ICX-NEXT:    retq
+;
+; CHECK-V4-LABEL: shuf_rot_v64i8_3012745611891015121314191617182320212227242526312829303532333439363738434041424744454651484950555253545956575863606162:
+; CHECK-V4:       # %bb.0:
+; CHECK-V4-NEXT:    vpaddb %zmm0, %zmm0, %zmm0
+; CHECK-V4-NEXT:    vprold $8, %zmm0, %zmm0
+; CHECK-V4-NEXT:    retq
+;
+; CHECK-ZNVER4-LABEL: shuf_rot_v64i8_3012745611891015121314191617182320212227242526312829303532333439363738434041424744454651484950555253545956575863606162:
+; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    vpaddb %zmm0, %zmm0, %zmm0
+; CHECK-ZNVER4-NEXT:    vpshldd $8, %zmm0, %zmm0, %zmm0
+; CHECK-ZNVER4-NEXT:    retq
   %x1 = add <64 x i8> %x, %x
   %r = shufflevector <64 x i8> %x1, <64 x i8> zeroinitializer, <64 x i32>
   ret <64 x i8> %r
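
Not part of the patch, just an illustrative note: the updated tests above only exercise single-input masks, where the VSHLDI form acts as an in-lane rotate of one register. Below is a minimal LLVM IR sketch of a genuinely two-input mask that lowerShuffleAsVSHLD is meant to catch; the function name and the assumed +avx512vl,+avx512vbmi2 target are illustrative, not taken from shuffle-as-shifts.ll. Per 64-bit lane the result takes the high i32 of %b followed by the low i32 of %a, i.e. the high half of (concat(%a, %b) << 32), which is expected to select as a single vpshldq $32 rather than a blend/shuffle sequence.

; Illustrative sketch only; assumed to be compiled with
; llc -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi2
define <4 x i32> @shuf_vshld_v4i32_two_input(<4 x i32> %a, <4 x i32> %b) {
  ; Per 64-bit lane: result element 0 = %b element 1, result element 1 = %a element 0.
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 0, i32 7, i32 2>
  ret <4 x i32> %r
}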