diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 71aee5038d518..a2d22440ff5ae 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -462,15 +462,19 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in def cvttpd2dq256 : X86Builtin<"_Vector<4, int>(_Vector<4, double>)">; def cvtpd2dq256 : X86Builtin<"_Vector<4, int>(_Vector<4, double>)">; def cvttps2dq256 : X86Builtin<"_Vector<8, int>(_Vector<8, float>)">; - def vperm2f128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">; - def vperm2f128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">; - def vperm2f128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">; foreach Op = ["max", "min"] in { def Op#pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>)">; def Op#ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>)">; } } +let Features = "avx", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def vperm2f128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">; + def vperm2f128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">; + def vperm2f128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">; +} + let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpermilpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int)">; def vpermilps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">; @@ -567,7 +571,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def psadbw256 : X86Builtin< "_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">; - def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">; } let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { @@ -575,6 +578,7 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">; def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long " "int>, _Constant int)">; + def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">; def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">; def pavgb256 : X86Builtin<"_Vector<32, unsigned char>(_Vector<32, unsigned char>, _Vector<32, unsigned char>)">; def pavgw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">; diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 59b48968d7b66..1751ebefa36d4 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -5101,6 +5101,29 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned SrcIdx = (ShuffleMask >> 6) & 0x1; return std::pair{SrcIdx, Offset}; }); + case X86::BI__builtin_ia32_vperm2f128_pd256: + case X86::BI__builtin_ia32_vperm2f128_ps256: + case X86::BI__builtin_ia32_vperm2f128_si256: + case X86::BI__builtin_ia32_permti256: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, + [BuiltinID, Call](unsigned DstIdx, unsigned ShuffleMask) { + const auto NumElements = + Call->getArg(0)->getType()->getAs()->getNumElements(); + const auto PreservedBitsCnt = NumElements >> 2; + const auto ControlBitsCnt = DstIdx >> PreservedBitsCnt << 2; + const auto ControlBits = ShuffleMask >> ControlBitsCnt; + + if (BuiltinID == X86::BI__builtin_ia32_permti256 && + (ControlBits & 0b1000)) + return std::make_pair(0u, -1); + + const auto SrcVecIdx = (ControlBits & 0b10) >> 1; + const auto PreservedBitsMask = (1 << PreservedBitsCnt) - 1; + const int SrcIdx = ((ControlBits & 0b1) << PreservedBitsCnt) | + (DstIdx & PreservedBitsMask); + return std::make_pair(SrcVecIdx, SrcIdx); + }); case X86::BI__builtin_ia32_pshufb128: case X86::BI__builtin_ia32_pshufb256: case X86::BI__builtin_ia32_pshufb512: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index d81496ffd74e0..02c2b2df427d0 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -14330,6 +14330,34 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(ResultElements, E); } + case X86::BI__builtin_ia32_vperm2f128_pd256: + case X86::BI__builtin_ia32_vperm2f128_ps256: + case X86::BI__builtin_ia32_vperm2f128_si256: + case X86::BI__builtin_ia32_permti256: { + APValue R; + if (!evalShuffleGeneric( + Info, E, R, [E](unsigned DstIdx, unsigned ShuffleMask) { + const auto NumElements = E->getArg(0) + ->getType() + ->getAs() + ->getNumElements(); + const auto PreservedBitsCnt = NumElements >> 2; + const auto ControlBitsCnt = DstIdx >> PreservedBitsCnt << 2; + const auto ControlBits = ShuffleMask >> ControlBitsCnt; + + if (E->getBuiltinCallee() == X86::BI__builtin_ia32_permti256 && + (ControlBits & 0b1000)) + return std::make_pair(0u, -1); + + const auto SrcVecIdx = (ControlBits & 0b10) >> 1; + const auto PreservedBitsMask = (1 << PreservedBitsCnt) - 1; + const int SrcIdx = ((ControlBits & 0b1) << PreservedBitsCnt) | + (DstIdx & PreservedBitsMask); + return std::make_pair(SrcVecIdx, SrcIdx); + })) + return false; + return Success(R, E); + } } } diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index 13da4292c5b92..5e2e947047412 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -1439,18 +1439,33 @@ __m256d test_mm256_permute2f128_pd(__m256d A, __m256d B) { return _mm256_permute2f128_pd(A, B, 0x31); } +TEST_CONSTEXPR(match_m256d(_mm256_permute2f128_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0xA7), 7.0, 8.0, 5.0, 6.0)); +TEST_CONSTEXPR(match_m256d(_mm256_permute2f128_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x5F), 7.0, 8.0, 3.0, 4.0)); +TEST_CONSTEXPR(match_m256d(_mm256_permute2f128_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x37), 7.0, 8.0, 7.0, 8.0)); +TEST_CONSTEXPR(match_m256d(_mm256_permute2f128_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x12), 5.0, 6.0, 3.0, 4.0)); + __m256 test_mm256_permute2f128_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_permute2f128_ps // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> return _mm256_permute2f128_ps(A, B, 0x13); } +TEST_CONSTEXPR(match_m256(_mm256_permute2f128_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), 0xA7), 13.0f, 14.0f, 15.0f, 16.0f, 9.0f, 10.0f, 11.0f, 12.0f)); +TEST_CONSTEXPR(match_m256(_mm256_permute2f128_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), 0x5F), 13.0f, 14.0f, 15.0f, 16.0f, 5.0f, 6.0f, 7.0f, 8.0f)); +TEST_CONSTEXPR(match_m256(_mm256_permute2f128_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), 0x37), 13.0f, 14.0f, 15.0f, 16.0f, 13.0f, 14.0f, 15.0f, 16.0f)); +TEST_CONSTEXPR(match_m256(_mm256_permute2f128_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), 0x12), 9.0f, 10.0f, 11.0f, 12.0f, 5.0f, 6.0f, 7.0f, 8.0f)); + __m256i test_mm256_permute2f128_si256(__m256i A, __m256i B) { // CHECK-LABEL: test_mm256_permute2f128_si256 // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> return _mm256_permute2f128_si256(A, B, 0x20); } +TEST_CONSTEXPR(match_m256i(_mm256_permute2f128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0xA7), 7LL, 8LL, 5LL, 6LL)); +TEST_CONSTEXPR(match_m256i(_mm256_permute2f128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0x5F), 7LL, 8LL, 3LL, 4LL)); +TEST_CONSTEXPR(match_m256i(_mm256_permute2f128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0x37), 7LL, 8LL, 7LL, 8LL)); +TEST_CONSTEXPR(match_m256i(_mm256_permute2f128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0x12), 5LL, 6LL, 3LL, 4LL)); + __m128d test_mm_permutevar_pd(__m128d A, __m128i B) { // CHECK-LABEL: test_mm_permutevar_pd // CHECK: call {{.*}}<2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %{{.*}}, <2 x i64> %{{.*}}) diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index c9474e94476fc..de3d92ea1c6cc 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -1106,6 +1106,11 @@ __m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) { return _mm256_permute2x128_si256(a, b, 0x38); } +TEST_CONSTEXPR(match_m256i(_mm256_permute2x128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0xA7), 7LL, 8LL, 0LL, 0LL)); +TEST_CONSTEXPR(match_m256i(_mm256_permute2x128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0x5F), 0LL, 0LL, 3LL, 4LL)); +TEST_CONSTEXPR(match_m256i(_mm256_permute2x128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0x37), 7LL, 8LL, 7LL, 8LL)); +TEST_CONSTEXPR(match_m256i(_mm256_permute2x128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0x12), 5LL, 6LL, 3LL, 4LL)); + __m256i test_mm256_permute4x64_epi64(__m256i a) { // CHECK-LABEL: test_mm256_permute4x64_epi64 // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <4 x i32>