
Commit f9c2a34

[SROA] Create additional vector type candidates based on store and load slices
Second try at A-Wadhwani's https://reviews.llvm.org/D132096, which was reverted. The original patch had three issues:

* https://reviews.llvm.org/D134032, which bjope kindly fixed. That patch is merged into this one.
* [GHI #57796](#57796). Fixed, and a test has been added.
* [GHI #57821](#57821). I believe this is undefined behavior that is not the fault of the original patch. Please see the issue for more details.

Original diff summary:

This patch adds additional vector types to be considered when doing promotion in SROA, based on the types of the store and load slices. This provides more promotion opportunities by potentially using an optimal "intermediate" vector type.

For example, the following code would currently not be promoted to a vector, since `__m128i` is a `<2 x i64>` vector:

```
__m128i packfoo0(int a, int b, int c, int d) {
  int r[4] = {a, b, c, d};
  __m128i rm;
  std::memcpy(&rm, r, sizeof(rm));
  return rm;
}
```

```
packfoo0(int, int, int, int):
        mov     dword ptr [rsp - 24], edi
        mov     dword ptr [rsp - 20], esi
        mov     dword ptr [rsp - 16], edx
        mov     dword ptr [rsp - 12], ecx
        movaps  xmm0, xmmword ptr [rsp - 24]
        ret
```

By also considering the element types of the loads and stores, we can find that `<4 x i32>` is a valid type for promotion, which removes the memory accesses from this function. In other words, we can explore new vector types of the same size but with different element types, derived from the load and store instructions in the partition's slices, which gives us more promotion opportunities.

Additionally, the step that removes duplicate elements from the `CandidateTys` vector was not using an equality comparator; this has been fixed.

Differential Revision: https://reviews.llvm.org/D143225
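For concreteness, below is a hand-written LLVM IR reduction of the `packfoo0` example. It is an illustrative sketch, not taken from the commit; the IR clang actually emits differs in value names, attributes, and ABI details. The key shape is four `i32` stores into a 16-byte alloca followed by a full-width `<2 x i64>` load, which is exactly the pattern this patch teaches SROA to promote via a `<4 x i32>` candidate.

```
; Hand-written sketch (not from the commit): the IR shape of packfoo0
; before SROA runs.
define <2 x i64> @packfoo0(i32 %a, i32 %b, i32 %c, i32 %d) {
entry:
  %r = alloca [4 x i32], align 16
  store i32 %a, ptr %r, align 16
  %r1 = getelementptr inbounds i32, ptr %r, i64 1
  store i32 %b, ptr %r1, align 4
  %r2 = getelementptr inbounds i32, ptr %r, i64 2
  store i32 %c, ptr %r2, align 8
  %r3 = getelementptr inbounds i32, ptr %r, i64 3
  store i32 %d, ptr %r3, align 4
  ; This load is the exact size of the partition, so <2 x i64> becomes a
  ; candidate type; the i32 stores then let the patch add <4 x i32> too.
  %rm = load <2 x i64>, ptr %r, align 16
  ret <2 x i64> %rm
}
```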
1 parent d2b768b

4 files changed, +132 -70 lines changed

llvm/lib/Transforms/Scalar/SROA.cpp

+31 -8
```
@@ -1976,6 +1976,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
   // Collect the candidate types for vector-based promotion. Also track whether
   // we have different element types.
   SmallVector<VectorType *, 4> CandidateTys;
+  SetVector<Type *> LoadStoreTys;
   Type *CommonEltTy = nullptr;
   VectorType *CommonVecPtrTy = nullptr;
   bool HaveVecPtrTy = false;
@@ -2009,15 +2010,37 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
       }
     }
   };
-  // Consider any loads or stores that are the exact size of the slice.
-  for (const Slice &S : P)
-    if (S.beginOffset() == P.beginOffset() &&
-        S.endOffset() == P.endOffset()) {
-      if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
-        CheckCandidateType(LI->getType());
-      else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
-        CheckCandidateType(SI->getValueOperand()->getType());
+  // Put load and store types into a set for de-duplication.
+  for (const Slice &S : P) {
+    Type *Ty;
+    if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
+      Ty = LI->getType();
+    else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
+      Ty = SI->getValueOperand()->getType();
+    else
+      continue;
+    LoadStoreTys.insert(Ty);
+    // Consider any loads or stores that are the exact size of the slice.
+    if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
+      CheckCandidateType(Ty);
+  }
+  // Consider additional vector types where the element type size is a
+  // multiple of load/store element size.
+  for (Type *Ty : LoadStoreTys) {
+    if (!VectorType::isValidElementType(Ty))
+      continue;
+    unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
+    for (VectorType *&VTy : CandidateTys) {
+      unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
+      unsigned ElementSize =
+          DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
+      if (TypeSize != VectorSize && TypeSize != ElementSize &&
+          VectorSize % TypeSize == 0) {
+        VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
+        CheckCandidateType(NewVTy);
+      }
     }
+  }
 
   // If we didn't find a vector type, nothing to do here.
   if (CandidateTys.empty())
```
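Tracing the new loops on the `packfoo0` example (my arithmetic, not stated in the commit message): the full-width `<2 x i64>` load passes the exact-size check and reaches `CheckCandidateType`, while the `i32` type from the stores lands in `LoadStoreTys`. The second loop then computes `TypeSize = 32`, `VectorSize = 128`, and `ElementSize = 64`; since 32 differs from both 128 and 64 and `128 % 32 == 0`, it builds `VectorType::get(i32, 128 / 32)` and adds `<4 x i32>` as a new candidate.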

llvm/test/Transforms/SROA/pr57796.ll

+46
```
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes='sroa<preserve-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
; RUN: opt < %s -passes='sroa<modify-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG

%struct.Value = type { %union.anon }
%union.anon = type { <32 x i8> }

@A = dso_local global i64 0, align 8

; Make sure that sroa does not crash when dealing with an invalid vector
; element type.
define void @foo() {
; CHECK-LABEL: @foo(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[REF_TMP_I:%.*]] = alloca [[STRUCT_VALUE:%.*]], align 32
; CHECK-NEXT:    call void @value_create(ptr sret([[STRUCT_VALUE]]) align 32 [[REF_TMP_I]])
; CHECK-NEXT:    [[CALL_I:%.*]] = call align 32 ptr @value_set_type(ptr align 32 [[REF_TMP_I]])
; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr [[CALL_I]], align 32
; CHECK-NEXT:    [[REF_TMP_SROA_0_0_VEC_EXTRACT:%.*]] = shufflevector <32 x i8> [[TMP0]], <32 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[REF_TMP_SROA_0_0_VEC_EXTRACT]] to x86_mmx
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 0)
; CHECK-NEXT:    store x86_mmx [[TMP2]], ptr @A, align 8
; CHECK-NEXT:    ret void
;
entry:
  %ref.tmp.i = alloca %struct.Value, align 32
  %ref.tmp = alloca %struct.Value, align 32
  call void @value_create(ptr sret(%struct.Value) align 32 %ref.tmp.i)
  %call.i = call align 32 ptr @value_set_type(ptr align 32 %ref.tmp.i)
  %0 = load <32 x i8>, ptr %call.i, align 32
  store <32 x i8> %0, ptr %ref.tmp, align 32
  %1 = load x86_mmx, ptr %ref.tmp, align 32
  %2 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 0)
  store x86_mmx %2, ptr @A, align 8
  ret void
}

declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8 immarg)

declare dso_local void @value_create(ptr sret(%struct.Value) align 32)

declare dso_local align 32 ptr @value_set_type(ptr align 32)

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK-MODIFY-CFG: {{.*}}
; CHECK-PRESERVE-CFG: {{.*}}
```

llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll

+22 -13
```
@@ -46,12 +46,15 @@ define amdgpu_kernel void @test_memset() #0 {
 ; CHECK-LABEL: @test_memset(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
 ; CHECK-NEXT:    br label [[BB:%.*]]
 ; CHECK:       bb:
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -217,7 +220,7 @@ define amdgpu_kernel void @test_struct_array_vector_i16() #0 {
 ; CHECK:       bb:
 ; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
 ; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_16_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -285,12 +288,15 @@ define amdgpu_kernel void @test_array_vector() #0 {
 ; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16
 ; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false)
 ; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
 ; CHECK-NEXT:    br label [[BB:%.*]]
 ; CHECK:       bb:
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -315,12 +321,15 @@ define amdgpu_kernel void @test_array_vector2() #0 {
 ; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16
 ; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false)
 ; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
 ; CHECK-NEXT:    br label [[BB:%.*]]
 ; CHECK:       bb:
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
 ; CHECK-NEXT:    ret void
 ;
 entry:
```
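A reading of the CHECK-line churn above (an inference, not stated in the commit message): the new LoadStoreTys-derived candidates change which vector type SROA selects for these 128-bit partitions from `<8 x half>` to `<8 x i16>`, so each `half` use is rebuilt with an explicit `i16`-to-`half` bitcast after its `extractelement`; the values themselves are unchanged.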
