[AMDGPU] Fix image intrinsic optimizer on loads from different resources (#69355)

jayfoad · web-flow · commit 104db26004d0 · 2023-10-18T11:08:01.000+01:00
The image intrinsic optimizer pass was neglecting to check any arguments
of the load intrinsic after the VAddr arguments. For example multiple
loads from different resources should not have been combined but were,
because the pass was not checking the resource argument.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
@@ -108,28 +108,23 @@ void addInstToMergeableList(
     if (IIList.front()->getType() != II->getType())
       continue;
 
-    // Check DMask.
-    Value *DMaskList = IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex);
-    Value *DMask = II->getArgOperand(ImageDimIntr->DMaskIndex);
-    if (DMaskList != DMask)
-      continue;
-
-    // Check VAddr (except FragId).
-    int I = ImageDimIntr->VAddrStart;
-    for (; I < ImageDimIntr->VAddrEnd - 1; ++I) {
-      if (IIList.front()->getArgOperand(I) != II->getArgOperand(I))
-        break;
+    // Check all arguments (DMask, VAddr, RSrc etc).
+    bool AllEqual = true;
+    assert(IIList.front()->arg_size() == II->arg_size());
+    for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
+      Value *ArgList = IIList.front()->getArgOperand(I);
+      Value *Arg = II->getArgOperand(I);
+      if (I == ImageDimIntr->VAddrEnd - 1) {
+        // Check FragId group.
+        auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
+        auto FragId = cast<ConstantInt>(II->getArgOperand(I));
+        AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
+      } else {
+        // Check all arguments except FragId.
+        AllEqual = ArgList == Arg;
+      }
     }
-
-    if (I != ImageDimIntr->VAddrEnd - 1)
-      continue;
-
-    // Check FragId group.
-    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
-    Value *FragIdList = IIList.front()->getArgOperand(FragIdIndex);
-    auto IIListFragId = cast<ConstantInt>(FragIdList);
-    auto IIFragId = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
-    if (IIListFragId->getValue().udiv(4) != IIFragId->getValue().udiv(4))
+    if (!AllEqual)
       continue;
 
     // Add to the list.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll
@@ -1184,6 +1184,47 @@ merge:
   ret [4 x float] %i25
 }
 
+define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1_different_rsrc(<8 x i32> inreg %rsrc1, <8 x i32> inreg %rsrc2, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1_different_rsrc(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC1:%.*]], <8 x i32> inreg [[RSRC2:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT:  main_body:
+; NO-MSAA-NEXT:    [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC1]], i32 0, i32 0)
+; NO-MSAA-NEXT:    [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC1]], i32 0, i32 0)
+; NO-MSAA-NEXT:    [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC2]], i32 0, i32 0)
+; NO-MSAA-NEXT:    [[I3:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC2]], i32 0, i32 0)
+; NO-MSAA-NEXT:    [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT:    [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; NO-MSAA-NEXT:    [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; NO-MSAA-NEXT:    [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; NO-MSAA-NEXT:    ret [4 x float] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1_different_rsrc(
+; MSAA-SAME: <8 x i32> inreg [[RSRC1:%.*]], <8 x i32> inreg [[RSRC2:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT:  main_body:
+; MSAA-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC1]], i32 0, i32 0)
+; MSAA-NEXT:    [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT:    [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC2]], i32 0, i32 0)
+; MSAA-NEXT:    [[I2:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; MSAA-NEXT:    [[I3:%.*]] = extractelement <4 x float> [[TMP1]], i64 1
+; MSAA-NEXT:    [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; MSAA-NEXT:    [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; MSAA-NEXT:    [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; MSAA-NEXT:    [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; MSAA-NEXT:    ret [4 x float] [[I7]]
+;
+main_body:
+  %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc1, i32 0, i32 0)
+  %i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc1, i32 0, i32 0)
+  %i2 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc2, i32 0, i32 0)
+  %i3 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc2, i32 0, i32 0)
+  %i4 = insertvalue [4 x float] undef, float %i, 0
+  %i5 = insertvalue [4 x float] %i4, float %i1, 1
+  %i6 = insertvalue [4 x float] %i5, float %i2, 2
+  %i7 = insertvalue [4 x float] %i6, float %i3, 3
+  ret [4 x float] %i7
+}
+
 declare float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
 declare <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
 declare <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0