Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -462,15 +462,19 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
def cvttpd2dq256 : X86Builtin<"_Vector<4, int>(_Vector<4, double>)">;
def cvtpd2dq256 : X86Builtin<"_Vector<4, int>(_Vector<4, double>)">;
def cvttps2dq256 : X86Builtin<"_Vector<8, int>(_Vector<8, float>)">;
def vperm2f128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
def vperm2f128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
def vperm2f128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
foreach Op = ["max", "min"] in {
def Op#pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>)">;
def Op#ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>)">;
}
}

let Features = "avx",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def vperm2f128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
def vperm2f128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
def vperm2f128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
}

let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def vpermilpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int)">;
def vpermilps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
Expand Down Expand Up @@ -567,14 +571,14 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def psadbw256
: X86Builtin<
"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
}

let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def permdf256
: X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long "
"int>, _Constant int)">;
def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
def pavgb256 : X86Builtin<"_Vector<32, unsigned char>(_Vector<32, unsigned char>, _Vector<32, unsigned char>)">;
def pavgw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
Expand Down
23 changes: 23 additions & 0 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5101,6 +5101,29 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
unsigned SrcIdx = (ShuffleMask >> 6) & 0x1;
return std::pair<unsigned, int>{SrcIdx, Offset};
});
case X86::BI__builtin_ia32_vperm2f128_pd256:
case X86::BI__builtin_ia32_vperm2f128_ps256:
case X86::BI__builtin_ia32_vperm2f128_si256:
case X86::BI__builtin_ia32_permti256:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call,
[BuiltinID, Call](unsigned DstIdx, unsigned ShuffleMask) {
const auto NumElements =
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The local variables shouldn't be const and they shouldn't use auto either.

Call->getArg(0)->getType()->getAs<VectorType>()->getNumElements();
const auto PreservedBitsCnt = NumElements >> 2;
const auto ControlBitsCnt = DstIdx >> PreservedBitsCnt << 2;
const auto ControlBits = ShuffleMask >> ControlBitsCnt;

if (BuiltinID == X86::BI__builtin_ia32_permti256 &&
(ControlBits & 0b1000))
return std::make_pair(0u, -1);

const auto SrcVecIdx = (ControlBits & 0b10) >> 1;
const auto PreservedBitsMask = (1 << PreservedBitsCnt) - 1;
const int SrcIdx = ((ControlBits & 0b1) << PreservedBitsCnt) |
(DstIdx & PreservedBitsMask);
return std::make_pair(SrcVecIdx, SrcIdx);
});
case X86::BI__builtin_ia32_pshufb128:
case X86::BI__builtin_ia32_pshufb256:
case X86::BI__builtin_ia32_pshufb512:
Expand Down
28 changes: 28 additions & 0 deletions clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14330,6 +14330,34 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {

return Success(ResultElements, E);
}
case X86::BI__builtin_ia32_vperm2f128_pd256:
case X86::BI__builtin_ia32_vperm2f128_ps256:
case X86::BI__builtin_ia32_vperm2f128_si256:
case X86::BI__builtin_ia32_permti256: {
APValue R;
if (!evalShuffleGeneric(
Info, E, R, [E](unsigned DstIdx, unsigned ShuffleMask) {
const auto NumElements = E->getArg(0)
->getType()
->getAs<VectorType>()
->getNumElements();
const auto PreservedBitsCnt = NumElements >> 2;
const auto ControlBitsCnt = DstIdx >> PreservedBitsCnt << 2;
const auto ControlBits = ShuffleMask >> ControlBitsCnt;

if (E->getBuiltinCallee() == X86::BI__builtin_ia32_permti256 &&
(ControlBits & 0b1000))
return std::make_pair(0u, -1);

const auto SrcVecIdx = (ControlBits & 0b10) >> 1;
const auto PreservedBitsMask = (1 << PreservedBitsCnt) - 1;
const int SrcIdx = ((ControlBits & 0b1) << PreservedBitsCnt) |
(DstIdx & PreservedBitsMask);
return std::make_pair(SrcVecIdx, SrcIdx);
}))
return false;
return Success(R, E);
}
}
}

Expand Down
15 changes: 15 additions & 0 deletions clang/test/CodeGen/X86/avx-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -1439,18 +1439,33 @@ __m256d test_mm256_permute2f128_pd(__m256d A, __m256d B) {
return _mm256_permute2f128_pd(A, B, 0x31);
}

TEST_CONSTEXPR(match_m256d(_mm256_permute2f128_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0xA7), 7.0, 8.0, 5.0, 6.0));
TEST_CONSTEXPR(match_m256d(_mm256_permute2f128_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x5F), 7.0, 8.0, 3.0, 4.0));
TEST_CONSTEXPR(match_m256d(_mm256_permute2f128_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x37), 7.0, 8.0, 7.0, 8.0));
TEST_CONSTEXPR(match_m256d(_mm256_permute2f128_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x12), 5.0, 6.0, 3.0, 4.0));

__m256 test_mm256_permute2f128_ps(__m256 A, __m256 B) {
// CHECK-LABEL: test_mm256_permute2f128_ps
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
return _mm256_permute2f128_ps(A, B, 0x13);
}

TEST_CONSTEXPR(match_m256(_mm256_permute2f128_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), 0xA7), 13.0f, 14.0f, 15.0f, 16.0f, 9.0f, 10.0f, 11.0f, 12.0f));
TEST_CONSTEXPR(match_m256(_mm256_permute2f128_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), 0x5F), 13.0f, 14.0f, 15.0f, 16.0f, 5.0f, 6.0f, 7.0f, 8.0f));
TEST_CONSTEXPR(match_m256(_mm256_permute2f128_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), 0x37), 13.0f, 14.0f, 15.0f, 16.0f, 13.0f, 14.0f, 15.0f, 16.0f));
TEST_CONSTEXPR(match_m256(_mm256_permute2f128_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), 0x12), 9.0f, 10.0f, 11.0f, 12.0f, 5.0f, 6.0f, 7.0f, 8.0f));

__m256i test_mm256_permute2f128_si256(__m256i A, __m256i B) {
// CHECK-LABEL: test_mm256_permute2f128_si256
// CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
return _mm256_permute2f128_si256(A, B, 0x20);
}

TEST_CONSTEXPR(match_m256i(_mm256_permute2f128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0xA7), 7LL, 8LL, 5LL, 6LL));
TEST_CONSTEXPR(match_m256i(_mm256_permute2f128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0x5F), 7LL, 8LL, 3LL, 4LL));
TEST_CONSTEXPR(match_m256i(_mm256_permute2f128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0x37), 7LL, 8LL, 7LL, 8LL));
TEST_CONSTEXPR(match_m256i(_mm256_permute2f128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0x12), 5LL, 6LL, 3LL, 4LL));

__m128d test_mm_permutevar_pd(__m128d A, __m128i B) {
// CHECK-LABEL: test_mm_permutevar_pd
// CHECK: call {{.*}}<2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %{{.*}}, <2 x i64> %{{.*}})
Expand Down
5 changes: 5 additions & 0 deletions clang/test/CodeGen/X86/avx2-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -1106,6 +1106,11 @@ __m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
return _mm256_permute2x128_si256(a, b, 0x38);
}

TEST_CONSTEXPR(match_m256i(_mm256_permute2x128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0xA7), 7LL, 8LL, 0LL, 0LL));
TEST_CONSTEXPR(match_m256i(_mm256_permute2x128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0x5F), 0LL, 0LL, 3LL, 4LL));
TEST_CONSTEXPR(match_m256i(_mm256_permute2x128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0x37), 7LL, 8LL, 7LL, 8LL));
TEST_CONSTEXPR(match_m256i(_mm256_permute2x128_si256(((__m256i){1LL, 2LL, 3LL, 4LL}), ((__m256i){5LL, 6LL, 7LL, 8LL}), 0x12), 5LL, 6LL, 3LL, 4LL));

__m256i test_mm256_permute4x64_epi64(__m256i a) {
// CHECK-LABEL: test_mm256_permute4x64_epi64
// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
Expand Down