@@ -857,6 +857,67 @@ define <4 x double> @shuffle_v4f64_2345_0567_select(<4 x double> %vec1, <4 x dou
857
857
ret <4 x double > %res
858
858
}
859
859
860
; Regression test for PR140234: a 1436-pattern v4f64 shuffle whose sources are
; split (two <2 x double>) loads must not be miscompiled when the shuffles are
; recombined. CHECK lines follow update_llc_test_checks.py conventions.
; PR140234
define <4 x double> @shuffle_v4f64_1436_split_load(ptr %px, ptr %py) {
; AVX1-LABEL: shuffle_v4f64_1436_split_load:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovapd (%rsi), %xmm0
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    vmovupd (%rdi), %ymm1
; AVX1-NEXT:    vinsertf128 $1, 16(%rsi), %ymm0, %ymm0
; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4f64_1436_split_load:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovapd (%rsi), %xmm0
; AVX2-NEXT:    vmovupd (%rdi), %ymm1
; AVX2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[3]
; AVX2-NEXT:    vbroadcastsd 16(%rsi), %ymm1
; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
; AVX2-NEXT:    retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4f64_1436_split_load:
; AVX512VL-SLOW:       # %bb.0:
; AVX512VL-SLOW-NEXT:    vmovapd (%rsi), %xmm0
; AVX512VL-SLOW-NEXT:    vmovupd (%rdi), %ymm1
; AVX512VL-SLOW-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[3]
; AVX512VL-SLOW-NEXT:    vbroadcastsd 16(%rsi), %ymm1
; AVX512VL-SLOW-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
; AVX512VL-SLOW-NEXT:    retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_1436_split_load:
; AVX512VL-FAST-ALL:       # %bb.0:
; AVX512VL-FAST-ALL-NEXT:    vmovapd (%rsi), %xmm0
; AVX512VL-FAST-ALL-NEXT:    vmovapd 16(%rsi), %xmm1
; AVX512VL-FAST-ALL-NEXT:    vmovupd (%rdi), %ymm2
; AVX512VL-FAST-ALL-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm0[0],ymm2[3],ymm0[3]
; AVX512VL-FAST-ALL-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,1,2,4]
; AVX512VL-FAST-ALL-NEXT:    vpermi2pd %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-ALL-NEXT:    retq
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_1436_split_load:
; AVX512VL-FAST-PERLANE:       # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT:    vmovapd (%rsi), %xmm0
; AVX512VL-FAST-PERLANE-NEXT:    vmovupd (%rdi), %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[3]
; AVX512VL-FAST-PERLANE-NEXT:    vbroadcastsd 16(%rsi), %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
; AVX512VL-FAST-PERLANE-NEXT:    retq
  ; Load x = <4 x double> at %px and y = <4 x double> at %py as two halves each.
  %pxhi = getelementptr inbounds nuw i8, ptr %px, i64 16
  %pyhi = getelementptr inbounds nuw i8, ptr %py, i64 16
  %x0 = load <2 x double>, ptr %px, align 16
  %y0 = load <2 x double>, ptr %py, align 16
  %x1 = load <2 x double>, ptr %pxhi, align 16
  %y1 = load <2 x double>, ptr %pyhi, align 16
  ; Chain of partial shuffles that together select elements {x[1], y[0], x[3], y[2]}
  ; (the "1436" pattern over the concatenated 8-element x++y vector).
  %shuf0 = shufflevector <2 x double> %x0, <2 x double> %y0, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
  %shuf1 = shufflevector <2 x double> %x1, <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
  %shuf2 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 0, i32 1, i32 5, i32 poison>
  %shuf3 = shufflevector <2 x double> %y1, <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
  %shuf4 = shufflevector <4 x double> %shuf2, <4 x double> %shuf3, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x double> %shuf4
}

860
921
define <4 x i64 > @shuffle_v4i64_0000 (<4 x i64 > %a , <4 x i64 > %b ) {
861
922
; AVX1-LABEL: shuffle_v4i64_0000:
862
923
; AVX1: # %bb.0:
0 commit comments