Skip to content

Commit 104db26

Browse files
authored
[AMDGPU] Fix image intrinsic optimizer on loads from different resources (#69355)
The image intrinsic optimizer pass neglected to check any arguments of the load intrinsic after the VAddr arguments. For example, multiple loads from different resources were combined when they should not have been, because the pass did not check the resource argument.
1 parent 675231e commit 104db26

File tree

2 files changed

+57
-21
lines changed

2 files changed

+57
-21
lines changed

llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp

Lines changed: 16 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -108,28 +108,23 @@ void addInstToMergeableList(
108108
if (IIList.front()->getType() != II->getType())
109109
continue;
110110

111-
// Check DMask.
112-
Value *DMaskList = IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex);
113-
Value *DMask = II->getArgOperand(ImageDimIntr->DMaskIndex);
114-
if (DMaskList != DMask)
115-
continue;
116-
117-
// Check VAddr (except FragId).
118-
int I = ImageDimIntr->VAddrStart;
119-
for (; I < ImageDimIntr->VAddrEnd - 1; ++I) {
120-
if (IIList.front()->getArgOperand(I) != II->getArgOperand(I))
121-
break;
111+
// Check all arguments (DMask, VAddr, RSrc etc).
112+
bool AllEqual = true;
113+
assert(IIList.front()->arg_size() == II->arg_size());
114+
for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
115+
Value *ArgList = IIList.front()->getArgOperand(I);
116+
Value *Arg = II->getArgOperand(I);
117+
if (I == ImageDimIntr->VAddrEnd - 1) {
118+
// Check FragId group.
119+
auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
120+
auto FragId = cast<ConstantInt>(II->getArgOperand(I));
121+
AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
122+
} else {
123+
// Check all arguments except FragId.
124+
AllEqual = ArgList == Arg;
125+
}
122126
}
123-
124-
if (I != ImageDimIntr->VAddrEnd - 1)
125-
continue;
126-
127-
// Check FragId group.
128-
const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
129-
Value *FragIdList = IIList.front()->getArgOperand(FragIdIndex);
130-
auto IIListFragId = cast<ConstantInt>(FragIdList);
131-
auto IIFragId = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
132-
if (IIListFragId->getValue().udiv(4) != IIFragId->getValue().udiv(4))
127+
if (!AllEqual)
133128
continue;
134129

135130
// Add to the list.

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1184,6 +1184,47 @@ merge:
11841184
ret [4 x float] %i25
11851185
}
11861186

1187+
define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1_different_rsrc(<8 x i32> inreg %rsrc1, <8 x i32> inreg %rsrc2, i32 %s, i32 %t) {
1188+
; NO-MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1_different_rsrc(
1189+
; NO-MSAA-SAME: <8 x i32> inreg [[RSRC1:%.*]], <8 x i32> inreg [[RSRC2:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
1190+
; NO-MSAA-NEXT: main_body:
1191+
; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC1]], i32 0, i32 0)
1192+
; NO-MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC1]], i32 0, i32 0)
1193+
; NO-MSAA-NEXT: [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC2]], i32 0, i32 0)
1194+
; NO-MSAA-NEXT: [[I3:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC2]], i32 0, i32 0)
1195+
; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
1196+
; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
1197+
; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
1198+
; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
1199+
; NO-MSAA-NEXT: ret [4 x float] [[I7]]
1200+
;
1201+
; MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1_different_rsrc(
1202+
; MSAA-SAME: <8 x i32> inreg [[RSRC1:%.*]], <8 x i32> inreg [[RSRC2:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
1203+
; MSAA-NEXT: main_body:
1204+
; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC1]], i32 0, i32 0)
1205+
; MSAA-NEXT: [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
1206+
; MSAA-NEXT: [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
1207+
; MSAA-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC2]], i32 0, i32 0)
1208+
; MSAA-NEXT: [[I2:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
1209+
; MSAA-NEXT: [[I3:%.*]] = extractelement <4 x float> [[TMP1]], i64 1
1210+
; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
1211+
; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
1212+
; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
1213+
; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
1214+
; MSAA-NEXT: ret [4 x float] [[I7]]
1215+
;
1216+
main_body:
1217+
%i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc1, i32 0, i32 0)
1218+
%i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc1, i32 0, i32 0)
1219+
%i2 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc2, i32 0, i32 0)
1220+
%i3 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc2, i32 0, i32 0)
1221+
%i4 = insertvalue [4 x float] undef, float %i, 0
1222+
%i5 = insertvalue [4 x float] %i4, float %i1, 1
1223+
%i6 = insertvalue [4 x float] %i5, float %i2, 2
1224+
%i7 = insertvalue [4 x float] %i6, float %i3, 3
1225+
ret [4 x float] %i7
1226+
}
1227+
11871228
declare float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
11881229
declare <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
11891230
declare <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0

0 commit comments

Comments
 (0)