Skip to content

Commit 057fd70

Browse files
[CIR] Added support for pslldqi
1 parent 3d04a3d commit 057fd70

File tree

2 files changed

+304
-1
lines changed

2 files changed

+304
-1
lines changed

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,44 @@ static mlir::Value emitX86SExtMask(CIRGenFunction &cgf, mlir::Value op,
158158
return cgf.getBuilder().createCast(loc, cir::CastKind::integral, mask, dstTy);
159159
}
160160

161+
static mlir::Value emitX86PSLLDQIByteShift(CIRGenFunction &cgf,
162+
const CallExpr *E,
163+
ArrayRef<mlir::Value> Ops) {
164+
165+
auto &builder = cgf.getBuilder();
166+
auto loc = cgf.getLoc(E->getExprLoc());
167+
168+
unsigned shiftVal = getIntValueFromConstOp(Ops[1]) & 0xff;
169+
auto resultType = cast<cir::VectorType>(Ops[0].getType());
170+
171+
unsigned numElts = resultType.getSize() * 8;
172+
if (shiftVal >= 16)
173+
return builder.getZero(loc, resultType);
174+
175+
llvm::SmallVector<int64_t, 64> indices;
176+
177+
for (unsigned l = 0; l != numElts; l += 16) {
178+
for (unsigned i = 0; i != 16; ++i) {
179+
unsigned idx = numElts + i - shiftVal;
180+
if (idx < numElts)
181+
idx -= numElts - 16;
182+
indices.push_back(idx + l);
183+
}
184+
}
185+
186+
// Cast to byte vector for shuffle operation
187+
auto byteVecTy = cir::VectorType::get(builder.getSInt8Ty(), numElts);
188+
mlir::Value byteCast = builder.createBitcast(Ops[0], byteVecTy);
189+
mlir::Value zero = builder.getZero(loc, byteVecTy);
190+
191+
// Perform the shuffle (left shift by inserting zeros)
192+
mlir::Value shuffleResult =
193+
builder.createVecShuffle(loc, zero, byteCast, indices);
194+
195+
// Cast back to original type
196+
return builder.createBitcast(shuffleResult, resultType);
197+
}
198+
161199
mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned BuiltinID,
162200
const CallExpr *E) {
163201
if (BuiltinID == Builtin::BI__builtin_cpu_is)
@@ -1119,7 +1157,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned BuiltinID,
11191157
case X86::BI__builtin_ia32_pslldqi128_byteshift:
11201158
case X86::BI__builtin_ia32_pslldqi256_byteshift:
11211159
case X86::BI__builtin_ia32_pslldqi512_byteshift:
1122-
llvm_unreachable("pslldqi NYI");
1160+
return emitX86PSLLDQIByteShift(*this, E, Ops);
11231161
case X86::BI__builtin_ia32_psrldqi128_byteshift:
11241162
case X86::BI__builtin_ia32_psrldqi256_byteshift:
11251163
case X86::BI__builtin_ia32_psrldqi512_byteshift:
Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir -target-feature +avx512f
2+
// RUN: FileCheck --input-file=%t.cir %s
3+
4+
// Simple test file that doesn't require immintrin.h
5+
// Tests PSLLDQI byte shift intrinsics implementation in ClangIR
6+
7+
typedef long long __m128i __attribute__((__vector_size__(16)));
8+
typedef long long __m256i __attribute__((__vector_size__(32)));
9+
typedef long long __m512i __attribute__((__vector_size__(64)));
10+
11+
// Declare the builtins directly
12+
extern __m128i __builtin_ia32_pslldqi128_byteshift(__m128i, int);
13+
extern __m256i __builtin_ia32_pslldqi256_byteshift(__m256i, int);
14+
extern __m512i __builtin_ia32_pslldqi512_byteshift(__m512i, int);
15+
16+
// ============================================================================
17+
// 128-bit Tests (Single Lane)
18+
// ============================================================================
19+
20+
// CHECK-LABEL: @_Z22test_pslldqi128_shift4Dv2_x
21+
__m128i test_pslldqi128_shift4(__m128i a) {
22+
// Should shift left by 4 bytes, filling with zeros
23+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>) [#cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, #cir.int<16> : !s32i, #cir.int<17> : !s32i, #cir.int<18> : !s32i, #cir.int<19> : !s32i, #cir.int<20> : !s32i, #cir.int<21> : !s32i, #cir.int<22> : !s32i, #cir.int<23> : !s32i, #cir.int<24> : !s32i, #cir.int<25> : !s32i, #cir.int<26> : !s32i, #cir.int<27> : !s32i] : !cir.vector<!s8i x 16>
24+
return __builtin_ia32_pslldqi128_byteshift(a, 4);
25+
}
26+
27+
// CHECK-LABEL: @_Z22test_pslldqi128_shift0Dv2_x
28+
__m128i test_pslldqi128_shift0(__m128i a) {
29+
// Should return input unchanged
30+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>) [#cir.int<16> : !s32i, #cir.int<17> : !s32i, #cir.int<18> : !s32i, #cir.int<19> : !s32i, #cir.int<20> : !s32i, #cir.int<21> : !s32i, #cir.int<22> : !s32i, #cir.int<23> : !s32i, #cir.int<24> : !s32i, #cir.int<25> : !s32i, #cir.int<26> : !s32i, #cir.int<27> : !s32i, #cir.int<28> : !s32i, #cir.int<29> : !s32i, #cir.int<30> : !s32i, #cir.int<31> : !s32i] : !cir.vector<!s8i x 16>
31+
return __builtin_ia32_pslldqi128_byteshift(a, 0);
32+
}
33+
34+
// CHECK-LABEL: @_Z22test_pslldqi128_shift8Dv2_x
35+
__m128i test_pslldqi128_shift8(__m128i a) {
36+
// Should shift left by 8 bytes (64 bits)
37+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>) [#cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, #cir.int<16> : !s32i, #cir.int<17> : !s32i, #cir.int<18> : !s32i, #cir.int<19> : !s32i, #cir.int<20> : !s32i, #cir.int<21> : !s32i, #cir.int<22> : !s32i, #cir.int<23> : !s32i] : !cir.vector<!s8i x 16>
38+
return __builtin_ia32_pslldqi128_byteshift(a, 8);
39+
}
40+
41+
// CHECK-LABEL: @_Z23test_pslldqi128_shift15Dv2_x
42+
__m128i test_pslldqi128_shift15(__m128i a) {
43+
// Only 1 byte from input should remain
44+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>) [#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, #cir.int<16> : !s32i] : !cir.vector<!s8i x 16>
45+
return __builtin_ia32_pslldqi128_byteshift(a, 15);
46+
}
47+
48+
// CHECK-LABEL: @_Z23test_pslldqi128_shift16Dv2_x
49+
__m128i test_pslldqi128_shift16(__m128i a) {
50+
// Entire vector shifted out, should return zero
51+
// CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 2>
52+
return __builtin_ia32_pslldqi128_byteshift(a, 16);
53+
}
54+
55+
// CHECK-LABEL: @_Z23test_pslldqi128_shift20Dv2_x
56+
__m128i test_pslldqi128_shift20(__m128i a) {
57+
// Should also return zero
58+
// CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 2>
59+
return __builtin_ia32_pslldqi128_byteshift(a, 20);
60+
}
61+
62+
// CHECK-LABEL: @_Z28test_pslldqi128_masked_shiftDv2_x
63+
__m128i test_pslldqi128_masked_shift(__m128i a) {
64+
// 250 > 16, so should return zero
65+
// CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 2>
66+
return __builtin_ia32_pslldqi128_byteshift(a, 250);
67+
}
68+
69+
// ============================================================================
70+
// 256-bit Tests (Two Independent Lanes)
71+
// ============================================================================
72+
73+
// CHECK-LABEL: @_Z22test_pslldqi256_shift4Dv4_x
74+
__m256i test_pslldqi256_shift4(__m256i a) {
75+
// Each 128-bit lane shifts independently
76+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>)
77+
return __builtin_ia32_pslldqi256_byteshift(a, 4);
78+
}
79+
80+
// CHECK-LABEL: @_Z22test_pslldqi256_shift0Dv4_x
81+
__m256i test_pslldqi256_shift0(__m256i a) {
82+
// Should return input unchanged
83+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>)
84+
return __builtin_ia32_pslldqi256_byteshift(a, 0);
85+
}
86+
87+
// CHECK-LABEL: @_Z22test_pslldqi256_shift8Dv4_x
88+
__m256i test_pslldqi256_shift8(__m256i a) {
89+
// Each lane shifts by 8 bytes independently
90+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>)
91+
return __builtin_ia32_pslldqi256_byteshift(a, 8);
92+
}
93+
94+
// Test shift by 12 (most of each lane)
95+
// CHECK-LABEL: @_Z23test_pslldqi256_shift12Dv4_x
96+
__m256i test_pslldqi256_shift12(__m256i a) {
97+
// Only 4 bytes remain in each lane
98+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>)
99+
return __builtin_ia32_pslldqi256_byteshift(a, 12);
100+
}
101+
102+
// Test shift by 15 (maximum valid)
103+
// CHECK-LABEL: @_Z23test_pslldqi256_shift15Dv4_x
104+
__m256i test_pslldqi256_shift15(__m256i a) {
105+
// Only 1 byte remains in each lane
106+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>)
107+
return __builtin_ia32_pslldqi256_byteshift(a, 15);
108+
}
109+
110+
// CHECK-LABEL: @_Z23test_pslldqi256_shift16Dv4_x
111+
__m256i test_pslldqi256_shift16(__m256i a) {
112+
// Both lanes completely shifted out, returns zero
113+
// CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 4>
114+
return __builtin_ia32_pslldqi256_byteshift(a, 16);
115+
}
116+
117+
// CHECK-LABEL: @_Z23test_pslldqi256_shift32Dv4_x
118+
__m256i test_pslldqi256_shift32(__m256i a) {
119+
// Should return zero vector
120+
// CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 4>
121+
return __builtin_ia32_pslldqi256_byteshift(a, 32);
122+
}
123+
124+
// ============================================================================
125+
// 512-bit Tests (Four Independent Lanes)
126+
// ============================================================================
127+
128+
// CHECK-LABEL: @_Z22test_pslldqi512_shift4Dv8_x
129+
__m512i test_pslldqi512_shift4(__m512i a) {
130+
// All 4 lanes shift independently by 4 bytes
131+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 64>)
132+
return __builtin_ia32_pslldqi512_byteshift(a, 4);
133+
}
134+
135+
// CHECK-LABEL: @_Z22test_pslldqi512_shift0Dv8_x
136+
__m512i test_pslldqi512_shift0(__m512i a) {
137+
// Should return input unchanged
138+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 64>)
139+
return __builtin_ia32_pslldqi512_byteshift(a, 0);
140+
}
141+
142+
// Test shift by 8
143+
// CHECK-LABEL: @_Z22test_pslldqi512_shift8Dv8_x
144+
__m512i test_pslldqi512_shift8(__m512i a) {
145+
// Each of 4 lanes shifts by 8 bytes
146+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 64>)
147+
return __builtin_ia32_pslldqi512_byteshift(a, 8);
148+
}
149+
150+
// Test shift by 15 (maximum valid)
151+
// CHECK-LABEL: @_Z23test_pslldqi512_shift15Dv8_x
152+
__m512i test_pslldqi512_shift15(__m512i a) {
153+
// Only 1 byte remains in each of the 4 lanes
154+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 64>)
155+
return __builtin_ia32_pslldqi512_byteshift(a, 15);
156+
}
157+
158+
// CHECK-LABEL: @_Z23test_pslldqi512_shift16Dv8_x
159+
__m512i test_pslldqi512_shift16(__m512i a) {
160+
// All 4 lanes completely cleared
161+
// CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 8>
162+
return __builtin_ia32_pslldqi512_byteshift(a, 16);
163+
}
164+
165+
// CHECK-LABEL: @_Z23test_pslldqi512_shift64Dv8_x
166+
__m512i test_pslldqi512_shift64(__m512i a) {
167+
// Should return zero vector
168+
// CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 8>
169+
return __builtin_ia32_pslldqi512_byteshift(a, 64);
170+
}
171+
172+
// Test with masked shift amount
173+
// CHECK-LABEL: @_Z28test_pslldqi512_masked_shiftDv8_x
174+
__m512i test_pslldqi512_masked_shift(__m512i a) {
175+
// 250 & 0xFF = 250, so should behave same as shift >= 16 (return zero)
176+
// CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 8>
177+
return __builtin_ia32_pslldqi512_byteshift(a, 250);
178+
}
179+
180+
// ============================================================================
181+
// Edge Cases and Special Scenarios
182+
// ============================================================================
183+
184+
// CHECK-LABEL: @_Z23test_consecutive_shiftsDv2_x
185+
__m128i test_consecutive_shifts(__m128i a) {
186+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>)
187+
__m128i tmp1 = __builtin_ia32_pslldqi128_byteshift(a, 2);
188+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>)
189+
__m128i tmp2 = __builtin_ia32_pslldqi128_byteshift(tmp1, 3);
190+
// Total shift of 5 bytes
191+
return tmp2;
192+
}
193+
194+
// CHECK-LABEL: @_Z21test_const_expr_shiftDv2_x
195+
__m128i test_const_expr_shift(__m128i a) {
196+
const int shift_amount = 3 + 4; // = 7
197+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>)
198+
return __builtin_ia32_pslldqi128_byteshift(a, shift_amount);
199+
}
200+
201+
// CHECK-LABEL: @_Z22test_lane_independenceDv4_xPS_S0_
202+
void test_lane_independence(__m256i a, __m256i* result1, __m256i* result2) {
203+
// Different shift amounts to show each lane operates independently
204+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>)
205+
*result1 = __builtin_ia32_pslldqi256_byteshift(a, 4);
206+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>)
207+
*result2 = __builtin_ia32_pslldqi256_byteshift(a, 8);
208+
// The two 128-bit lanes in each result shift independently
209+
}
210+
211+
// CHECK-LABEL: @_Z22test_pslldqi128_shift1Dv2_x
212+
__m128i test_pslldqi128_shift1(__m128i a) {
213+
// Shifts by just 1 byte
214+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>)
215+
return __builtin_ia32_pslldqi128_byteshift(a, 1);
216+
}
217+
218+
// Test boundary case: shift by 14
219+
// CHECK-LABEL: @_Z23test_pslldqi128_shift14Dv2_x
220+
__m128i test_pslldqi128_shift14(__m128i a) {
221+
// 2 bytes remain
222+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>)
223+
return __builtin_ia32_pslldqi128_byteshift(a, 14);
224+
}
225+
226+
// ============================================================================
227+
// Pattern Tests - Verify the shuffle indices work correctly
228+
// ============================================================================
229+
230+
// These tests help verify the shuffle index calculation
231+
232+
// Test that verifies zeros are inserted from the left
233+
// CHECK-LABEL: @_Z19test_zero_insertionDv2_x
234+
__m128i test_zero_insertion(__m128i a) {
235+
// After shift by 4, first 4 bytes should be zero
236+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>)
237+
return __builtin_ia32_pslldqi128_byteshift(a, 4);
238+
}
239+
240+
// Test all three sizes with same shift to compare behavior
241+
// CHECK-LABEL: @_Z21test_all_sizes_shift4Dv2_xDv4_xDv8_xPS_PS0_PS1_
242+
void test_all_sizes_shift4(__m128i a128, __m256i a256, __m512i a512,
243+
__m128i* r128, __m256i* r256, __m512i* r512) {
244+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>)
245+
*r128 = __builtin_ia32_pslldqi128_byteshift(a128, 4);
246+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>)
247+
*r256 = __builtin_ia32_pslldqi256_byteshift(a256, 4);
248+
// CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 64>)
249+
*r512 = __builtin_ia32_pslldqi512_byteshift(a512, 4);
250+
// Each should shift their lane(s) by 4 bytes
251+
}
252+
253+
// CHECK-LABEL: @_Z22test_large_shift_valueDv2_x
254+
__m128i test_large_shift_value(__m128i a) {
255+
// 240 & 0xFF = 240, so this should return zero (240 > 16)
256+
// CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 2>
257+
return __builtin_ia32_pslldqi128_byteshift(a, 240);
258+
}
259+
260+
// CHECK-LABEL: @_Z26test_large_shift_value_256Dv4_x
261+
__m256i test_large_shift_value_256(__m256i a) {
262+
// 244 & 0xFF = 244, so this should return zero (244 > 16)
263+
// CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 4>
264+
return __builtin_ia32_pslldqi256_byteshift(a, 244);
265+
}

0 commit comments

Comments
 (0)