@@ -825,21 +825,21 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
; SSE2-SSSE3-NEXT: por %xmm2, %xmm3
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
- ; SSE2-SSSE3-NEXT: por %xmm3, %xmm0
- ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1
- ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
- ; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm3
- ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
+ ; SSE2-SSSE3-NEXT: por %xmm0, %xmm3
+ ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1
+ ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+ ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
+ ; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
- ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
- ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm2
+ ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
+ ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
- ; SSE2-SSSE3-NEXT: por %xmm2, %xmm1
- ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+ ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
+ ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3
; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
- ; SSE2-SSSE3-NEXT: por %xmm1, %xmm0
- ; SSE2-SSSE3-NEXT: packssdw %xmm0, %xmm0
- ; SSE2-SSSE3-NEXT: packssdw %xmm0, %xmm0
+ ; SSE2-SSSE3-NEXT: por %xmm3, %xmm1
+ ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+ ; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_ssat_v2i64_v2i16:
@@ -866,9 +866,8 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
- ; SSE41-NEXT: packssdw %xmm1, %xmm1
- ; SSE41-NEXT: packssdw %xmm1, %xmm1
- ; SSE41-NEXT: movdqa %xmm1, %xmm0
+ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+ ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_ssat_v2i64_v2i16:
@@ -881,21 +880,32 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
- ; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
- ; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+ ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: retq
;
- ; AVX2-LABEL: trunc_ssat_v2i64_v2i16:
- ; AVX2: # %bb.0:
- ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
- ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
- ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
- ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
- ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
- ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
- ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
- ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
- ; AVX2-NEXT: retq
+ ; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16:
+ ; AVX2-SLOW: # %bb.0:
+ ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
+ ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+ ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+ ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+ ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+ ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+ ; AVX2-SLOW-NEXT: retq
+ ;
+ ; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16:
+ ; AVX2-FAST: # %bb.0:
+ ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
+ ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+ ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+ ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+ ; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+ ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+ ; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_ssat_v2i64_v2i16:
; AVX512F: # %bb.0:
@@ -963,9 +973,9 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3
; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-SSSE3-NEXT: por %xmm3, %xmm1
- ; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm1
- ; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm1
- ; SSE2-SSSE3-NEXT: movd %xmm1, (%rdi)
+ ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+ ; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+ ; SSE2-SSSE3-NEXT: movd %xmm0, (%rdi)
; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_ssat_v2i64_v2i16_store:
@@ -992,9 +1002,9 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
- ; SSE41-NEXT: packssdw %xmm1, %xmm1
- ; SSE41-NEXT: packssdw %xmm1, %xmm1
- ; SSE41-NEXT: movd %xmm1, (%rdi)
+ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+ ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+ ; SSE41-NEXT: movd %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_ssat_v2i64_v2i16_store:
@@ -1007,23 +1017,35 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
- ; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
- ; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+ ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: vmovd %xmm0, (%rdi)
; AVX1-NEXT: retq
;
- ; AVX2-LABEL: trunc_ssat_v2i64_v2i16_store:
- ; AVX2: # %bb.0:
- ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
- ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
- ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
- ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
- ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
- ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
- ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
- ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
- ; AVX2-NEXT: vmovd %xmm0, (%rdi)
- ; AVX2-NEXT: retq
+ ; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16_store:
+ ; AVX2-SLOW: # %bb.0:
+ ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
+ ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+ ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+ ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+ ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+ ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+ ; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi)
+ ; AVX2-SLOW-NEXT: retq
+ ;
+ ; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16_store:
+ ; AVX2-FAST: # %bb.0:
+ ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
+ ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+ ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+ ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+ ; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+ ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
+ ; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi)
+ ; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_ssat_v2i64_v2i16_store:
; AVX512F: # %bb.0: