Commit 7dc4d5f

[X86] Add AVX512 (x86-64-v4) coverage to generic shift combines tests
1 parent 34acdb3 commit 7dc4d5f
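This commit adds an AVX512 run to the generic shift-combine test files by selecting the x86-64-v4 CPU model and then refreshing the autogenerated assertions. A minimal sketch of that workflow (assuming a freshly built llc is on PATH; utils/update_llc_test_checks.py, named in each test's NOTE header, can also be pointed at a specific llc binary):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
$ llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/X86/combine-shl.ll

The RUN line above is the one introduced in combine-shl.ll below; regenerating the CHECK blocks for the new prefix is where most of the +293/-142 line delta comes from.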

3 files changed: +293 -142 lines changed

llvm/test/CodeGen/X86/combine-shl.ll

Lines changed: 159 additions & 89 deletions
@@ -1,9 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-ALL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
 ; fold (shl 0, x) -> 0
 define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
@@ -137,32 +138,40 @@ define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
 ; SSE41-NEXT: pmulld %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-SLOW-LABEL: combine_vec_shl_trunc_and:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-SLOW-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vzeroupper
-; AVX-SLOW-NEXT: retq
-;
-; AVX-FAST-ALL-LABEL: combine_vec_shl_trunc_and:
-; AVX-FAST-ALL: # %bb.0:
-; AVX-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
-; AVX-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-FAST-ALL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-FAST-ALL-NEXT: vzeroupper
-; AVX-FAST-ALL-NEXT: retq
-;
-; AVX-FAST-PERLANE-LABEL: combine_vec_shl_trunc_and:
-; AVX-FAST-PERLANE: # %bb.0:
-; AVX-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-FAST-PERLANE-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-FAST-PERLANE-NEXT: vzeroupper
-; AVX-FAST-PERLANE-NEXT: retq
+; AVX2-SLOW-LABEL: combine_vec_shl_trunc_and:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-ALL-LABEL: combine_vec_shl_trunc_and:
+; AVX2-FAST-ALL: # %bb.0:
+; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-FAST-ALL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-ALL-NEXT: vzeroupper
+; AVX2-FAST-ALL-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: combine_vec_shl_trunc_and:
+; AVX2-FAST-PERLANE: # %bb.0:
+; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vzeroupper
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_shl_trunc_and:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovqd %ymm1, %xmm1
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
 %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
 %2 = trunc <4 x i64> %1 to <4 x i32>
 %3 = shl <4 x i32> %x, %2
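The AVX512 block above is noticeably shorter than the AVX2 variants because AVX512VL has a dedicated narrowing move: a <4 x i64> to <4 x i32> trunc selects a single vpmovqd instead of the vextractf128 + vshufps pair. A minimal standalone sketch (hypothetical function, not part of combine-shl.ll) that should show the same lowering:

define <4 x i32> @trunc_v4i64_v4i32(<4 x i64> %v) {
  ; narrowing truncation; with -mcpu=x86-64-v4 this is expected to select vpmovqd %ymm0, %xmm0
  %t = trunc <4 x i64> %v to <4 x i32>
  ret <4 x i32> %t
}

Compiled with llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4, the expected output is roughly vpmovqd %ymm0, %xmm0 followed by vzeroupper and retq.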
@@ -353,11 +362,17 @@ define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: combine_vec_shl_zext_lshr0:
-; AVX: # %bb.0:
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: retq
+; AVX2-LABEL: combine_vec_shl_zext_lshr0:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_shl_zext_lshr0:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: retq
 %1 = lshr <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
 %2 = zext <8 x i16> %1 to <8 x i32>
 %3 = shl <8 x i32> %2, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
@@ -504,12 +519,18 @@ define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) {
 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_vec_shl_gt_lshr0:
-; AVX: # %bb.0:
-; AVX-NEXT: vpslld $2, %xmm0, %xmm0
-; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: combine_vec_shl_gt_lshr0:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpslld $2, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_shl_gt_lshr0:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpslld $2, %xmm0, %xmm0
+; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: retq
 %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
 %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
 ret <4 x i32> %2
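The other recurring AVX512 difference in this diff is the {1to4} embedded-broadcast memory operand: a splat constant is folded directly into vpandd/vpord/vpaddd/vpmulld, so the separate vpbroadcastd materialization that the AVX2 blocks check for disappears. A minimal standalone sketch (hypothetical function, not part of the test file) of the same folding:

define <4 x i32> @and_splat_neg32(<4 x i32> %x) {
  ; splat constant (-32 == 4294967264); under -mcpu=x86-64-v4 it is expected to fold as a broadcast load
  %r = and <4 x i32> %x, <i32 -32, i32 -32, i32 -32, i32 -32>
  ret <4 x i32> %r
}

With llc -mcpu=x86-64-v4 this should compile to a single and with an embedded-broadcast operand, e.g. vandps constant(%rip){1to4} (or the integer-domain vpandd form), matching the pattern asserted by the new AVX512 checks.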
@@ -540,12 +561,18 @@ define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) {
 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_vec_shl_le_lshr0:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $2, %xmm0, %xmm0
-; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: combine_vec_shl_le_lshr0:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_shl_le_lshr0:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: retq
 %1 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
 %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
 ret <4 x i32> %2
@@ -587,11 +614,16 @@ define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) {
 ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_vec_shl_ashr0:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
-; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: combine_vec_shl_ashr0:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
+; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_shl_ashr0:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: retq
 %1 = ashr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
 %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
 ret <4 x i32> %2
@@ -620,12 +652,18 @@ define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) {
 ; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_vec_shl_add0:
-; AVX: # %bb.0:
-; AVX-NEXT: vpslld $2, %xmm0, %xmm0
-; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: combine_vec_shl_add0:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpslld $2, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_shl_add0:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpslld $2, %xmm0, %xmm0
+; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: retq
 %1 = add <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
 %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
 ret <4 x i32> %2
@@ -667,12 +705,18 @@ define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
 ; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_vec_shl_or0:
-; AVX: # %bb.0:
-; AVX-NEXT: vpslld $2, %xmm0, %xmm0
-; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: combine_vec_shl_or0:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpslld $2, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_shl_or0:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpslld $2, %xmm0, %xmm0
+; AVX512-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: retq
 %1 = or <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
 %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
 ret <4 x i32> %2
@@ -724,11 +768,16 @@ define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
 ; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: combine_vec_shl_mul0:
-; AVX: # %bb.0:
-; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: combine_vec_shl_mul0:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
+; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_shl_mul0:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: retq
 %1 = mul <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
 %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
 ret <4 x i32> %2
@@ -778,12 +827,18 @@ define <4 x i32> @combine_vec_add_shl_nonsplat(<4 x i32> %a0) {
 ; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: combine_vec_add_shl_nonsplat:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: combine_vec_add_shl_nonsplat:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_add_shl_nonsplat:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: retq
 %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 4, i32 5>
 %2 = add <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
 ret <4 x i32> %2
@@ -812,14 +867,22 @@ define <4 x i32> @combine_vec_add_shl_and_nonsplat(<4 x i32> %a0) {
 ; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: combine_vec_add_shl_and_nonsplat:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: combine_vec_add_shl_and_nonsplat:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_add_shl_and_nonsplat:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX512-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: retq
 %1 = and <4 x i32> %a0, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
 %2 = shl <4 x i32> %1, <i32 2, i32 3, i32 4, i32 5>
 %3 = add <4 x i32> %2, <i32 15, i32 15, i32 15, i32 15>
@@ -847,13 +910,20 @@ define <4 x i32> @combine_vec_add_shuffle_shl(<4 x i32> %a0) {
 ; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: combine_vec_add_shuffle_shl:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
-; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: combine_vec_add_shuffle_shl:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_add_shuffle_shl:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX512-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: retq
 %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 0, i32 1>
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
 %3 = add <4 x i32> %2, <i32 3, i32 3, i32 3, i32 3>
