
Commit dd7ba38

[X86] matchTruncateWithPACK - consistently prefer shuffles for truncation to sub-64-bit vXi16
If we're truncating from v2i32 / v2i64, then PSHUFLW / PSHUFD+PSHUFLW should more easily allow further shuffle combines than a PACK chain will.
1 parent 7b8f5f7 commit dd7ba38
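
The PSHUFD+PSHUFLW sequence the updated tests expect can be modelled with SSE2 intrinsics. A minimal sketch (the function and driver below are illustrative, not part of the commit): PSHUFD [0,2,2,3] gathers the low dword of each qword, then PSHUFLW [0,2,2,3] gathers the low word of each of those dwords.

#include <emmintrin.h> // SSE2: _mm_shuffle_epi32 / _mm_shufflelo_epi16
#include <cstdio>

// Illustrative model of the new v2i64 -> v2i16 lowering:
// pshufd xmm, xmm, [0,2,2,3] then pshuflw xmm, xmm, [0,2,2,3,4,5,6,7].
static __m128i trunc_v2i64_v2i16(__m128i v) {
  // PSHUFD [0,2,2,3]: dwords 0 and 2 are the low halves of each qword.
  __m128i t = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 2, 0));
  // PSHUFLW [0,2,2,3]: words 0 and 2 are the low halves of those dwords.
  return _mm_shufflelo_epi16(t, _MM_SHUFFLE(3, 2, 2, 0));
}

int main() {
  __m128i v = _mm_set_epi64x(0x1111222233334444LL, 0x5555666677778888LL);
  __m128i r = trunc_v2i64_v2i16(v);
  // The two truncated lanes now sit in words 0 and 1.
  std::printf("%04x %04x\n", _mm_extract_epi16(r, 0) & 0xffff,
              _mm_extract_epi16(r, 1) & 0xffff); // prints: 8888 4444
}

Because these are plain lane permutes with no saturation semantics, surrounding shuffles can fold into them, which is what the commit message means by further shuffle combines.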

File tree

6 files changed: +161 -117 lines

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 5 additions & 5 deletions
@@ -20134,12 +20134,12 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
   assert(SrcSVT.getSizeInBits() > DstSVT.getSizeInBits() && "Bad truncation");
   unsigned NumStages = Log2_32(SrcSVT.getSizeInBits() / DstSVT.getSizeInBits());
 
-  // Truncation to sub-128bit vXi32 can be better handled with shuffles.
-  if (DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128)
-    return SDValue();
-
+  // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
+  // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
   // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
-  if (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3())
+  if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
+      (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
+      (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
     return SDValue();
 
   // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
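
Worked through the new early-out: for v2i64 -> v2i16, NumStages = Log2_32(64/16) = 2, and the 128-bit source satisfies 128 <= 64 * 2, so PACK matching now bails and the shuffle lowering takes over; v4i32 -> v4i16 (128-bit source, NumStages = 1) still fails the test and keeps PACKSSDW. A standalone sketch of the predicate under simplified types (plain integers instead of MVT/EVT; the v2i8 PSHUFB special case and its SSSE3 subtarget check are omitted here):

#include <cassert>

// Standalone model of the new bail-out condition in matchTruncateWithPACK.
// SrcScalarBits/DstScalarBits/SrcVectorBits stand in for the MVT/EVT queries.
static bool preferShuffles(unsigned SrcScalarBits, unsigned DstScalarBits,
                           unsigned SrcVectorBits) {
  assert(SrcScalarBits > DstScalarBits && "Bad truncation");
  unsigned NumStages = 0; // Log2_32(SrcScalarBits / DstScalarBits)
  for (unsigned R = SrcScalarBits / DstScalarBits; R > 1; R >>= 1)
    ++NumStages;
  return (DstScalarBits == 32 && SrcVectorBits <= 128) ||
         (DstScalarBits == 16 && SrcVectorBits <= 64 * NumStages);
}

int main() {
  assert(preferShuffles(64, 16, 128));  // v2i64 -> v2i16: PSHUFD+PSHUFLW
  assert(preferShuffles(32, 16, 64));   // v2i32 -> v2i16: PSHUFLW
  assert(!preferShuffles(32, 16, 128)); // v4i32 -> v4i16: keep PACKSSDW
  assert(!preferShuffles(32, 16, 256)); // v8i32 -> v8i16: keep PACK chain
}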

llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll

Lines changed: 17 additions & 17 deletions
@@ -2524,15 +2524,15 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm4
 ; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: packssdw %xmm0, %xmm0
-; SSE2-NEXT: packssdw %xmm0, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
 ; SSE2-NEXT: pand %xmm1, %xmm2
@@ -2567,8 +2567,8 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT: movapd %xmm4, %xmm0
 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
 ; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE4-NEXT: packssdw %xmm2, %xmm2
-; SSE4-NEXT: packssdw %xmm2, %xmm2
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT: movmskpd %xmm3, %eax
 ; SSE4-NEXT: xorl $3, %eax
@@ -2580,11 +2580,11 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT: .LBB7_4: # %else2
 ; SSE4-NEXT: retq
 ; SSE4-NEXT: .LBB7_1: # %cond.store
-; SSE4-NEXT: pextrw $0, %xmm2, (%rdi)
+; SSE4-NEXT: pextrw $0, %xmm0, (%rdi)
 ; SSE4-NEXT: testb $2, %al
 ; SSE4-NEXT: je .LBB7_4
 ; SSE4-NEXT: .LBB7_3: # %cond.store1
-; SSE4-NEXT: pextrw $1, %xmm2, 2(%rdi)
+; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
 ; SSE4-NEXT: retq
 ;
 ; AVX1-LABEL: truncstore_v2i64_v2i16:
@@ -2598,8 +2598,8 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX1-NEXT: # xmm3 = mem[0,0]
 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4
 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vmovmskpd %xmm1, %eax
 ; AVX1-NEXT: xorl $3, %eax
@@ -2627,8 +2627,8 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4
 ; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vmovmskpd %xmm1, %eax
 ; AVX2-NEXT: xorl $3, %eax

llvm/test/CodeGen/X86/masked_store_trunc_usat.ll

Lines changed: 13 additions & 13 deletions
@@ -2205,17 +2205,17 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-LABEL: truncstore_v2i64_v2i16:
 ; SSE4: # %bb.0:
 ; SSE4-NEXT: movdqa %xmm0, %xmm2
-; SSE4-NEXT: pxor %xmm4, %xmm4
-; SSE4-NEXT: movapd {{.*#+}} xmm3 = [65535,65535]
+; SSE4-NEXT: pxor %xmm3, %xmm3
+; SSE4-NEXT: movapd {{.*#+}} xmm4 = [65535,65535]
 ; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
 ; SSE4-NEXT: pxor %xmm0, %xmm5
 ; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854841343,9223372036854841343]
 ; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE4-NEXT: packusdw %xmm3, %xmm3
-; SSE4-NEXT: packusdw %xmm3, %xmm3
-; SSE4-NEXT: pcmpeqq %xmm1, %xmm4
-; SSE4-NEXT: movmskpd %xmm4, %eax
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
+; SSE4-NEXT: movmskpd %xmm3, %eax
 ; SSE4-NEXT: xorl $3, %eax
 ; SSE4-NEXT: testb $1, %al
 ; SSE4-NEXT: jne .LBB7_1
@@ -2225,11 +2225,11 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT: .LBB7_4: # %else2
 ; SSE4-NEXT: retq
 ; SSE4-NEXT: .LBB7_1: # %cond.store
-; SSE4-NEXT: pextrw $0, %xmm3, (%rdi)
+; SSE4-NEXT: pextrw $0, %xmm0, (%rdi)
 ; SSE4-NEXT: testb $2, %al
 ; SSE4-NEXT: je .LBB7_4
 ; SSE4-NEXT: .LBB7_3: # %cond.store1
-; SSE4-NEXT: pextrw $1, %xmm3, 2(%rdi)
+; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
 ; SSE4-NEXT: retq
 ;
 ; AVX1-LABEL: truncstore_v2i64_v2i16:
@@ -2242,8 +2242,8 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX1-NEXT: # xmm5 = mem[0,0]
 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vmovmskpd %xmm1, %eax
 ; AVX1-NEXT: xorl $3, %eax
@@ -2271,8 +2271,8 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
 ; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
 ; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vmovmskpd %xmm1, %eax
 ; AVX2-NEXT: xorl $3, %eax

llvm/test/CodeGen/X86/vector-trunc-packus.ll

Lines changed: 1 addition & 1 deletion
@@ -910,7 +910,7 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) {
 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
 ; AVX2-FAST-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
 ; AVX2-FAST-NEXT: retq
 ;
 ; AVX512F-LABEL: trunc_packus_v2i64_v2i16:
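
The only change above is the VPSHUFB mask: the upper lanes go from undef (u) to the exact bytes produced by composing pshufd [0,2,2,3] with pshuflw [0,2,2,3], i.e. the new shuffle pair has been combined into the single VPSHUFB on this fast-shuffle target. A small intrinsics sketch of that composed mask (illustrative, not part of the commit):

#include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8
#include <cassert>

// Bytes 0,1 are the low word of qword 0; bytes 8,9 the low word of qword 1.
// One PSHUFB therefore does the work of the PSHUFD+PSHUFLW pair; the trailing
// indices simply match what the composed shuffles leave in the upper lanes.
static __m128i trunc_v2i64_v2i16_ssse3(__m128i v) {
  const __m128i M = _mm_setr_epi8(0, 1, 8, 9, 8, 9, 10, 11,
                                  8, 9, 10, 11, 12, 13, 14, 15);
  return _mm_shuffle_epi8(v, M);
}

int main() {
  __m128i v = _mm_set_epi64x(0x1111222233334444LL, 0x5555666677778888LL);
  __m128i r = trunc_v2i64_v2i16_ssse3(v);
  assert((_mm_extract_epi16(r, 0) & 0xffff) == 0x8888);
  assert((_mm_extract_epi16(r, 1) & 0xffff) == 0x4444);
}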

llvm/test/CodeGen/X86/vector-trunc-ssat.ll

Lines changed: 70 additions & 48 deletions
@@ -825,21 +825,21 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
 ; SSE2-SSSE3-NEXT: por %xmm2, %xmm3
 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-SSSE3-NEXT: por %xmm3, %xmm0
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm3
+; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE2-SSSE3-NEXT: pand %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3
 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-SSSE3-NEXT: por %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: packssdw %xmm0, %xmm0
-; SSE2-SSSE3-NEXT: packssdw %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: por %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE2-SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: trunc_ssat_v2i64_v2i16:
@@ -866,9 +866,8 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm3, %xmm0
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: packssdw %xmm1, %xmm1
-; SSE41-NEXT: packssdw %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_ssat_v2i64_v2i16:
@@ -881,21 +880,32 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
 ; AVX1-NEXT: # xmm1 = mem[0,0]
 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT: retq
 ;
-; AVX2-LABEL: trunc_ssat_v2i64_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
+; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
+; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: retq
 ;
 ; AVX512F-LABEL: trunc_ssat_v2i64_v2i16:
 ; AVX512F: # %bb.0:
@@ -963,9 +973,9 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3
 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-SSSE3-NEXT: por %xmm3, %xmm1
-; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm1
-; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm1
-; SSE2-SSSE3-NEXT: movd %xmm1, (%rdi)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-SSSE3-NEXT: movd %xmm0, (%rdi)
 ; SSE2-SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: trunc_ssat_v2i64_v2i16_store:
@@ -992,9 +1002,9 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm3, %xmm0
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: packssdw %xmm1, %xmm1
-; SSE41-NEXT: packssdw %xmm1, %xmm1
-; SSE41-NEXT: movd %xmm1, (%rdi)
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE41-NEXT: movd %xmm0, (%rdi)
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_ssat_v2i64_v2i16_store:
@@ -1007,23 +1017,35 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
 ; AVX1-NEXT: # xmm1 = mem[0,0]
 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT: vmovd %xmm0, (%rdi)
 ; AVX1-NEXT: retq
 ;
-; AVX2-LABEL: trunc_ssat_v2i64_v2i16_store:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, (%rdi)
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16_store:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
+; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16_store:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
+; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi)
+; AVX2-FAST-NEXT: retq
 ;
 ; AVX512F-LABEL: trunc_ssat_v2i64_v2i16_store:
 ; AVX512F: # %bb.0:
