@@ -2831,15 +2831,15 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
2831
2831
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0]
2832
2832
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm2
2833
2833
; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm2
2834
- ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm9
2835
- ; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm10
2834
+ ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm10
2835
+ ; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm9
2836
2836
; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128]
2837
- ; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm10 , %ymm4
2838
- ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10 [0,1,1,1]
2837
+ ; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm9 , %ymm4
2838
+ ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9 [0,1,1,1]
2839
2839
; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535]
2840
- ; AVX512F-SLOW-NEXT: vpandnq %ymm10 , %ymm21, %ymm10
2841
- ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm10 , %zmm10
2842
- ; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10
2840
+ ; AVX512F-SLOW-NEXT: vpandnq %ymm9 , %ymm21, %ymm9
2841
+ ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm9 , %zmm9
2842
+ ; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm9
2843
2843
; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm2
2844
2844
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3]
2845
2845
; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm4
@@ -2860,7 +2860,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
2860
2860
; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535]
2861
2861
; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm7, %zmm4
2862
2862
; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm2
2863
- ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm9 [0,1,1,1]
2863
+ ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm10 [0,1,1,1]
2864
2864
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2
2865
2865
; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2
2866
2866
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5]
@@ -2909,16 +2909,15 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
2909
2909
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
2910
2910
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
2911
2911
; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm1
2912
- ; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm0
2913
- ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535]
2914
- ; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm3, %ymm0
2915
- ; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm9, %ymm3
2916
- ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
2912
+ ; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm0
2913
+ ; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm3
2914
+ ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
2915
+ ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
2917
2916
; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
2918
2917
; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%r9)
2919
2918
; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 256(%r9)
2920
2919
; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%r9)
2921
- ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10 , 192(%r9)
2920
+ ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9 , 192(%r9)
2922
2921
; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 128(%r9)
2923
2922
; AVX512F-SLOW-NEXT: vzeroupper
2924
2923
; AVX512F-SLOW-NEXT: retq
@@ -3019,11 +3018,10 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
3019
3018
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7
3020
3019
; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535]
3021
3020
; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm7
3021
+ ; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm3
3022
+ ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3022
3023
; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm3
3023
- ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535]
3024
- ; AVX512F-FAST-NEXT: vpandn %ymm3, %ymm13, %ymm3
3025
- ; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm11
3026
- ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0
3024
+ ; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
3027
3025
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
3028
3026
; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm0
3029
3027
; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29]
0 commit comments