|
| 1 | +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir -target-feature +avx512f |
| 2 | +// RUN: FileCheck --input-file=%t.cir %s |
| 3 | + |
| 4 | +// Simple test file that doesn't require immintrin.h |
| 5 | +// Tests PSLLDQI byte shift intrinsics implementation in ClangIR |
| 6 | + |
| 7 | +typedef long long __m128i __attribute__((__vector_size__(16))); |
| 8 | +typedef long long __m256i __attribute__((__vector_size__(32))); |
| 9 | +typedef long long __m512i __attribute__((__vector_size__(64))); |
| 10 | + |
| 11 | +// Declare the builtins directly |
| 12 | +extern __m128i __builtin_ia32_pslldqi128_byteshift(__m128i, int); |
| 13 | +extern __m256i __builtin_ia32_pslldqi256_byteshift(__m256i, int); |
| 14 | +extern __m512i __builtin_ia32_pslldqi512_byteshift(__m512i, int); |
| 15 | + |
| 16 | +// ============================================================================ |
| 17 | +// 128-bit Tests (Single Lane) |
| 18 | +// ============================================================================ |
| 19 | + |
| 20 | +// CHECK-LABEL: @_Z22test_pslldqi128_shift4Dv2_x |
| 21 | +__m128i test_pslldqi128_shift4(__m128i a) { |
| 22 | + // Should shift left by 4 bytes, filling with zeros |
| 23 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>) [#cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, #cir.int<16> : !s32i, #cir.int<17> : !s32i, #cir.int<18> : !s32i, #cir.int<19> : !s32i, #cir.int<20> : !s32i, #cir.int<21> : !s32i, #cir.int<22> : !s32i, #cir.int<23> : !s32i, #cir.int<24> : !s32i, #cir.int<25> : !s32i, #cir.int<26> : !s32i, #cir.int<27> : !s32i] : !cir.vector<!s8i x 16> |
| 24 | + return __builtin_ia32_pslldqi128_byteshift(a, 4); |
| 25 | +} |
| 26 | + |
| 27 | +// CHECK-LABEL: @_Z22test_pslldqi128_shift0Dv2_x |
| 28 | +__m128i test_pslldqi128_shift0(__m128i a) { |
| 29 | + // Should return input unchanged |
| 30 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>) [#cir.int<16> : !s32i, #cir.int<17> : !s32i, #cir.int<18> : !s32i, #cir.int<19> : !s32i, #cir.int<20> : !s32i, #cir.int<21> : !s32i, #cir.int<22> : !s32i, #cir.int<23> : !s32i, #cir.int<24> : !s32i, #cir.int<25> : !s32i, #cir.int<26> : !s32i, #cir.int<27> : !s32i, #cir.int<28> : !s32i, #cir.int<29> : !s32i, #cir.int<30> : !s32i, #cir.int<31> : !s32i] : !cir.vector<!s8i x 16> |
| 31 | + return __builtin_ia32_pslldqi128_byteshift(a, 0); |
| 32 | +} |
| 33 | + |
| 34 | +// CHECK-LABEL: @_Z22test_pslldqi128_shift8Dv2_x |
| 35 | +__m128i test_pslldqi128_shift8(__m128i a) { |
| 36 | + // Should shift left by 8 bytes (64 bits) |
| 37 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>) [#cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, #cir.int<16> : !s32i, #cir.int<17> : !s32i, #cir.int<18> : !s32i, #cir.int<19> : !s32i, #cir.int<20> : !s32i, #cir.int<21> : !s32i, #cir.int<22> : !s32i, #cir.int<23> : !s32i] : !cir.vector<!s8i x 16> |
| 38 | + return __builtin_ia32_pslldqi128_byteshift(a, 8); |
| 39 | +} |
| 40 | + |
| 41 | +// CHECK-LABEL: @_Z23test_pslldqi128_shift15Dv2_x |
| 42 | +__m128i test_pslldqi128_shift15(__m128i a) { |
| 43 | + // Only 1 byte from input should remain |
| 44 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>) [#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, #cir.int<16> : !s32i] : !cir.vector<!s8i x 16> |
| 45 | + return __builtin_ia32_pslldqi128_byteshift(a, 15); |
| 46 | +} |
| 47 | + |
| 48 | +// CHECK-LABEL: @_Z23test_pslldqi128_shift16Dv2_x |
| 49 | +__m128i test_pslldqi128_shift16(__m128i a) { |
| 50 | + // Entire vector shifted out, should return zero |
| 51 | + // CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 2> |
| 52 | + return __builtin_ia32_pslldqi128_byteshift(a, 16); |
| 53 | +} |
| 54 | + |
| 55 | +// CHECK-LABEL: @_Z23test_pslldqi128_shift20Dv2_x |
| 56 | +__m128i test_pslldqi128_shift20(__m128i a) { |
| 57 | + // Should also return zero |
| 58 | + // CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 2> |
| 59 | + return __builtin_ia32_pslldqi128_byteshift(a, 20); |
| 60 | +} |
| 61 | + |
| 62 | +// CHECK-LABEL: @_Z28test_pslldqi128_masked_shiftDv2_x |
| 63 | +__m128i test_pslldqi128_masked_shift(__m128i a) { |
| 64 | + // 250 > 16, so should return zero |
| 65 | + // CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 2> |
| 66 | + return __builtin_ia32_pslldqi128_byteshift(a, 250); |
| 67 | +} |
| 68 | + |
| 69 | +// ============================================================================ |
| 70 | +// 256-bit Tests (Two Independent Lanes) |
| 71 | +// ============================================================================ |
| 72 | + |
| 73 | +// CHECK-LABEL: @_Z22test_pslldqi256_shift4Dv4_x |
| 74 | +__m256i test_pslldqi256_shift4(__m256i a) { |
| 75 | + // Each 128-bit lane shifts independently |
| 76 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>) |
| 77 | + return __builtin_ia32_pslldqi256_byteshift(a, 4); |
| 78 | +} |
| 79 | + |
| 80 | +// CHECK-LABEL: @_Z22test_pslldqi256_shift0Dv4_x |
| 81 | +__m256i test_pslldqi256_shift0(__m256i a) { |
| 82 | + // Should return input unchanged |
| 83 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>) |
| 84 | + return __builtin_ia32_pslldqi256_byteshift(a, 0); |
| 85 | +} |
| 86 | + |
| 87 | +// CHECK-LABEL: @_Z22test_pslldqi256_shift8Dv4_x |
| 88 | +__m256i test_pslldqi256_shift8(__m256i a) { |
| 89 | + // Each lane shifts by 8 bytes independently |
| 90 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>) |
| 91 | + return __builtin_ia32_pslldqi256_byteshift(a, 8); |
| 92 | +} |
| 93 | + |
| 94 | +// Test shift by 12 (most of each lane) |
| 95 | +// CHECK-LABEL: @_Z23test_pslldqi256_shift12Dv4_x |
| 96 | +__m256i test_pslldqi256_shift12(__m256i a) { |
| 97 | + // Only 4 bytes remain in each lane |
| 98 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>) |
| 99 | + return __builtin_ia32_pslldqi256_byteshift(a, 12); |
| 100 | +} |
| 101 | + |
| 102 | +// Test shift by 15 (maximum valid) |
| 103 | +// CHECK-LABEL: @_Z23test_pslldqi256_shift15Dv4_x |
| 104 | +__m256i test_pslldqi256_shift15(__m256i a) { |
| 105 | + // Only 1 byte remains in each lane |
| 106 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>) |
| 107 | + return __builtin_ia32_pslldqi256_byteshift(a, 15); |
| 108 | +} |
| 109 | + |
| 110 | +// CHECK-LABEL: @_Z23test_pslldqi256_shift16Dv4_x |
| 111 | +__m256i test_pslldqi256_shift16(__m256i a) { |
| 112 | + // Both lanes completely shifted out, returns zero |
| 113 | + // CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 4> |
| 114 | + return __builtin_ia32_pslldqi256_byteshift(a, 16); |
| 115 | +} |
| 116 | + |
| 117 | +// CHECK-LABEL: @_Z23test_pslldqi256_shift32Dv4_x |
| 118 | +__m256i test_pslldqi256_shift32(__m256i a) { |
| 119 | + // Should return zero vector |
| 120 | + // CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 4> |
| 121 | + return __builtin_ia32_pslldqi256_byteshift(a, 32); |
| 122 | +} |
| 123 | + |
| 124 | +// ============================================================================ |
| 125 | +// 512-bit Tests (Four Independent Lanes) |
| 126 | +// ============================================================================ |
| 127 | + |
| 128 | +// CHECK-LABEL: @_Z22test_pslldqi512_shift4Dv8_x |
| 129 | +__m512i test_pslldqi512_shift4(__m512i a) { |
| 130 | + // All 4 lanes shift independently by 4 bytes |
| 131 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 64>) |
| 132 | + return __builtin_ia32_pslldqi512_byteshift(a, 4); |
| 133 | +} |
| 134 | + |
| 135 | +// CHECK-LABEL: @_Z22test_pslldqi512_shift0Dv8_x |
| 136 | +__m512i test_pslldqi512_shift0(__m512i a) { |
| 137 | + // Should return input unchanged |
| 138 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 64>) |
| 139 | + return __builtin_ia32_pslldqi512_byteshift(a, 0); |
| 140 | +} |
| 141 | + |
| 142 | +// Test shift by 8 |
| 143 | +// CHECK-LABEL: @_Z22test_pslldqi512_shift8Dv8_x |
| 144 | +__m512i test_pslldqi512_shift8(__m512i a) { |
| 145 | + // Each of 4 lanes shifts by 8 bytes |
| 146 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 64>) |
| 147 | + return __builtin_ia32_pslldqi512_byteshift(a, 8); |
| 148 | +} |
| 149 | + |
| 150 | +// Test shift by 15 (maximum valid) |
| 151 | +// CHECK-LABEL: @_Z23test_pslldqi512_shift15Dv8_x |
| 152 | +__m512i test_pslldqi512_shift15(__m512i a) { |
| 153 | + // Only 1 byte remains in each of the 4 lanes |
| 154 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 64>) |
| 155 | + return __builtin_ia32_pslldqi512_byteshift(a, 15); |
| 156 | +} |
| 157 | + |
| 158 | +// CHECK-LABEL: @_Z23test_pslldqi512_shift16Dv8_x |
| 159 | +__m512i test_pslldqi512_shift16(__m512i a) { |
| 160 | + // All 4 lanes completely cleared |
| 161 | + // CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 8> |
| 162 | + return __builtin_ia32_pslldqi512_byteshift(a, 16); |
| 163 | +} |
| 164 | + |
| 165 | +// CHECK-LABEL: @_Z23test_pslldqi512_shift64Dv8_x |
| 166 | +__m512i test_pslldqi512_shift64(__m512i a) { |
| 167 | + // Should return zero vector |
| 168 | + // CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 8> |
| 169 | + return __builtin_ia32_pslldqi512_byteshift(a, 64); |
| 170 | +} |
| 171 | + |
| 172 | +// Test with masked shift amount |
| 173 | +// CHECK-LABEL: @_Z28test_pslldqi512_masked_shiftDv8_x |
| 174 | +__m512i test_pslldqi512_masked_shift(__m512i a) { |
| 175 | + // 250 & 0xFF = 250, so should behave same as shift >= 16 (return zero) |
| 176 | + // CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 8> |
| 177 | + return __builtin_ia32_pslldqi512_byteshift(a, 250); |
| 178 | +} |
| 179 | + |
| 180 | +// ============================================================================ |
| 181 | +// Edge Cases and Special Scenarios |
| 182 | +// ============================================================================ |
| 183 | + |
| 184 | +// CHECK-LABEL: @_Z23test_consecutive_shiftsDv2_x |
| 185 | +__m128i test_consecutive_shifts(__m128i a) { |
| 186 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>) |
| 187 | + __m128i tmp1 = __builtin_ia32_pslldqi128_byteshift(a, 2); |
| 188 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>) |
| 189 | + __m128i tmp2 = __builtin_ia32_pslldqi128_byteshift(tmp1, 3); |
| 190 | + // Total shift of 5 bytes |
| 191 | + return tmp2; |
| 192 | +} |
| 193 | + |
| 194 | +// CHECK-LABEL: @_Z21test_const_expr_shiftDv2_x |
| 195 | +__m128i test_const_expr_shift(__m128i a) { |
| 196 | + const int shift_amount = 3 + 4; // = 7 |
| 197 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>) |
| 198 | + return __builtin_ia32_pslldqi128_byteshift(a, shift_amount); |
| 199 | +} |
| 200 | + |
| 201 | +// CHECK-LABEL: @_Z22test_lane_independenceDv4_xPS_S0_ |
| 202 | +void test_lane_independence(__m256i a, __m256i* result1, __m256i* result2) { |
| 203 | + // Different shift amounts to show each lane operates independently |
| 204 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>) |
| 205 | + *result1 = __builtin_ia32_pslldqi256_byteshift(a, 4); |
| 206 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>) |
| 207 | + *result2 = __builtin_ia32_pslldqi256_byteshift(a, 8); |
| 208 | + // The two 128-bit lanes in each result shift independently |
| 209 | +} |
| 210 | + |
| 211 | +// CHECK-LABEL: @_Z22test_pslldqi128_shift1Dv2_x |
| 212 | +__m128i test_pslldqi128_shift1(__m128i a) { |
| 213 | + // Shifts by just 1 byte |
| 214 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>) |
| 215 | + return __builtin_ia32_pslldqi128_byteshift(a, 1); |
| 216 | +} |
| 217 | + |
| 218 | +// Test boundary case: shift by 14 |
| 219 | +// CHECK-LABEL: @_Z23test_pslldqi128_shift14Dv2_x |
| 220 | +__m128i test_pslldqi128_shift14(__m128i a) { |
| 221 | + // 2 bytes remain |
| 222 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>) |
| 223 | + return __builtin_ia32_pslldqi128_byteshift(a, 14); |
| 224 | +} |
| 225 | + |
| 226 | +// ============================================================================ |
| 227 | +// Pattern Tests - Verify the shuffle indices work correctly |
| 228 | +// ============================================================================ |
| 229 | + |
| 230 | +// These tests help verify the shuffle index calculation |
| 231 | + |
| 232 | +// Test that verifies zeros are inserted from the left |
| 233 | +// CHECK-LABEL: @_Z19test_zero_insertionDv2_x |
| 234 | +__m128i test_zero_insertion(__m128i a) { |
| 235 | + // After shift by 4, first 4 bytes should be zero |
| 236 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>) |
| 237 | + return __builtin_ia32_pslldqi128_byteshift(a, 4); |
| 238 | +} |
| 239 | + |
| 240 | +// Test all three sizes with same shift to compare behavior |
| 241 | +// CHECK-LABEL: @_Z21test_all_sizes_shift4Dv2_xDv4_xDv8_xPS_PS0_PS1_ |
| 242 | +void test_all_sizes_shift4(__m128i a128, __m256i a256, __m512i a512, |
| 243 | + __m128i* r128, __m256i* r256, __m512i* r512) { |
| 244 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 16>) |
| 245 | + *r128 = __builtin_ia32_pslldqi128_byteshift(a128, 4); |
| 246 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 32>) |
| 247 | + *r256 = __builtin_ia32_pslldqi256_byteshift(a256, 4); |
| 248 | + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!s8i x 64>) |
| 249 | + *r512 = __builtin_ia32_pslldqi512_byteshift(a512, 4); |
| 250 | + // Each should shift their lane(s) by 4 bytes |
| 251 | +} |
| 252 | + |
| 253 | +// CHECK-LABEL: @_Z22test_large_shift_valueDv2_x |
| 254 | +__m128i test_large_shift_value(__m128i a) { |
| 255 | + // 240 & 0xFF = 240, so this should return zero (240 > 16) |
| 256 | + // CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 2> |
| 257 | + return __builtin_ia32_pslldqi128_byteshift(a, 240); |
| 258 | +} |
| 259 | + |
| 260 | +// CHECK-LABEL: @_Z26test_large_shift_value_256Dv4_x |
| 261 | +__m256i test_large_shift_value_256(__m256i a) { |
| 262 | + // 244 & 0xFF = 244, so this should return zero (244 > 16) |
| 263 | + // CHECK: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 4> |
| 264 | + return __builtin_ia32_pslldqi256_byteshift(a, 244); |
| 265 | +} |
0 commit comments