-
Notifications
You must be signed in to change notification settings - Fork 15.6k
Description
I have observed (on gfx1100, though I suspect other chips may be affected too) that a sufficiently long sequence of loads will cause the backend (on or around current main - 0729a74 and 4204244 have this bug) to emit more than 63 loads without issuing a waitcnt to bring the number of outstanding loads back below the hardware limit.
Overflowing VMCNT (which on gfx11 and other architectures, like gfx9, is stored as 6 bits) appears to have undefined behavior. In practice, this has led to unstable numerical results in an ML workload's matrix-vector multiplication, where stale values were (intermittently) read because a waitcnt was passed before a value had returned.
The full compilation for input.ll (attached below) was done with
llc -O3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr='+wavefrontsize32' input.ll
Then, I reduced the testcase down to the following input (looking at the debug logs from si-insert-waitcnts to gauge interestingness), which is also attached as reduced.ll
Reduced LLVM IR
target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"
; Reduced reproducer kernel. It consists of an entry block that jumps straight into a
; single self-looping block (.lr.ph). That block issues a long run of scalar float loads
; from addrspace(1) pointers plus one <32 x float> load from the addrspace(7) argument,
; assembles the scalars into <14 x float> vectors, chains those vectors through
; llvm.fma.v14f32, and stores the final vector. The loop has no exit: the block
; unconditionally branches back to itself.
define amdgpu_kernel void @"main$async_dispatch_28_matmul_like_Dx1024x14x14x512_f32"(ptr addrspace(1) %invariant.gep, ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, ptr addrspace(1) %10, ptr addrspace(1) %11, ptr addrspace(1) %12, ptr addrspace(1) %13, ptr addrspace(1) %14, ptr addrspace(1) %15, ptr addrspace(1) %16, ptr addrspace(1) %17, ptr addrspace(1) %18, ptr addrspace(1) %19, ptr addrspace(1) %20, ptr addrspace(1) %21, ptr addrspace(1) %22, ptr addrspace(1) %23, ptr addrspace(1) %24, ptr addrspace(1) %25, ptr addrspace(1) %26, ptr addrspace(1) %27, ptr addrspace(1) %28, ptr addrspace(1) %29, ptr addrspace(1) %30, ptr addrspace(1) %31, ptr addrspace(1) %32, ptr addrspace(1) %33, ptr addrspace(1) %34, ptr addrspace(1) %35, ptr addrspace(1) %36, ptr addrspace(1) %37, ptr addrspace(1) %38, ptr addrspace(1) %39, ptr addrspace(1) %40, ptr addrspace(1) %41, ptr addrspace(1) %42, ptr addrspace(1) %43, ptr addrspace(1) %44, ptr addrspace(1) %45, ptr addrspace(1) %46, ptr addrspace(1) %47, ptr addrspace(1) %48, ptr addrspace(1) %49, ptr addrspace(7) %50) {
.lr.ph197:
br label %.lr.ph
; Loop body: every load below is independent of the others, and nothing consumes a loaded
; value until the insertelement/fma chain.
.lr.ph: ; preds = %.lr.ph, %.lr.ph197
%51 = load float, ptr addrspace(1) %27, align 8
%52 = load float, ptr addrspace(1) inttoptr (i64 80 to ptr addrspace(1)), align 16
%53 = load float, ptr addrspace(1) inttoptr (i64 96 to ptr addrspace(1)), align 32
%54 = insertelement <14 x float> zeroinitializer, float %53, i64 0
%55 = load float, ptr addrspace(1) inttoptr (i64 88 to ptr addrspace(1)), align 8
%56 = insertelement <14 x float> %54, float %55, i64 1
%57 = load float, ptr addrspace(1) null, align 8
%58 = insertelement <14 x float> %56, float %57, i64 9
%59 = load float, ptr addrspace(1) %43, align 32
%60 = insertelement <14 x float> zeroinitializer, float %59, i64 0
%61 = load float, ptr addrspace(1) %44, align 8
%62 = insertelement <14 x float> %60, float %61, i64 1
%63 = load float, ptr addrspace(1) %45, align 16
%64 = insertelement <14 x float> %62, float %63, i64 2
%65 = load float, ptr addrspace(1) %46, align 8
%66 = insertelement <14 x float> %64, float %65, i64 3
%67 = getelementptr i8, ptr addrspace(1) %43, i64 32
%68 = load float, ptr addrspace(1) %67, align 32
%69 = insertelement <14 x float> %66, float %68, i64 4
%70 = load float, ptr addrspace(1) %47, align 8
%71 = insertelement <14 x float> %69, float %70, i64 5
%72 = getelementptr i8, ptr addrspace(1) %43, i64 48
%73 = load float, ptr addrspace(1) %72, align 16
%74 = insertelement <14 x float> %71, float %73, i64 6
%75 = getelementptr i8, ptr addrspace(1) %43, i64 56
%76 = load float, ptr addrspace(1) %75, align 8
%77 = insertelement <14 x float> %74, float %76, i64 7
%78 = load float, ptr addrspace(1) %13, align 32
%79 = insertelement <14 x float> %77, float %78, i64 8
%80 = load float, ptr addrspace(1) %49, align 8
%81 = load float, ptr addrspace(1) %28, align 32
%82 = insertelement <14 x float> zeroinitializer, float %81, i64 0
%83 = load float, ptr addrspace(1) %29, align 8
%84 = insertelement <14 x float> %82, float %83, i64 1
%85 = load float, ptr addrspace(1) %30, align 16
%86 = insertelement <14 x float> %84, float %85, i64 2
%87 = load float, ptr addrspace(1) %31, align 8
%88 = insertelement <14 x float> %86, float %87, i64 3
%89 = load float, ptr addrspace(1) %32, align 32
%90 = insertelement <14 x float> %88, float %89, i64 4
%91 = load float, ptr addrspace(1) %33, align 8
%92 = insertelement <14 x float> %90, float %91, i64 5
%93 = load float, ptr addrspace(1) %34, align 16
%94 = insertelement <14 x float> %92, float %93, i64 6
%95 = load float, ptr addrspace(1) %35, align 8
%96 = insertelement <14 x float> %94, float %95, i64 7
%97 = load float, ptr addrspace(1) %36, align 32
%98 = insertelement <14 x float> %96, float %97, i64 8
%99 = load float, ptr addrspace(1) %37, align 8
%100 = insertelement <14 x float> %98, float %99, i64 9
%101 = load float, ptr addrspace(1) %38, align 16
%102 = insertelement <14 x float> %100, float %101, i64 10
%103 = load float, ptr addrspace(1) %39, align 8
%104 = insertelement <14 x float> %102, float %103, i64 11
%105 = load float, ptr addrspace(1) %40, align 32
%106 = insertelement <14 x float> %104, float %105, i64 12
%107 = load float, ptr addrspace(1) %41, align 8
%108 = insertelement <14 x float> %106, float %107, i64 13
%109 = load float, ptr addrspace(1) %14, align 32
%110 = insertelement <14 x float> zeroinitializer, float %109, i64 0
%111 = load float, ptr addrspace(1) %15, align 8
%112 = insertelement <14 x float> %110, float %111, i64 1
%113 = load float, ptr addrspace(1) %16, align 16
%114 = insertelement <14 x float> %112, float %113, i64 2
%115 = load float, ptr addrspace(1) %17, align 8
%116 = insertelement <14 x float> %114, float %115, i64 3
%117 = load float, ptr addrspace(1) %18, align 32
%118 = insertelement <14 x float> %116, float %117, i64 4
%119 = load float, ptr addrspace(1) %19, align 8
%120 = insertelement <14 x float> %118, float %119, i64 5
%121 = load float, ptr addrspace(1) %20, align 16
%122 = insertelement <14 x float> %120, float %121, i64 6
%123 = load float, ptr addrspace(1) %21, align 8
%124 = insertelement <14 x float> %122, float %123, i64 7
%125 = load float, ptr addrspace(1) %22, align 32
%126 = insertelement <14 x float> %124, float %125, i64 8
%127 = load float, ptr addrspace(1) %23, align 8
%128 = insertelement <14 x float> %126, float %127, i64 9
%129 = load float, ptr addrspace(1) %24, align 16
%130 = insertelement <14 x float> %128, float %129, i64 10
%131 = load float, ptr addrspace(1) %25, align 8
%132 = insertelement <14 x float> %130, float %131, i64 11
%133 = load float, ptr addrspace(1) %26, align 32
%134 = insertelement <14 x float> %132, float %133, i64 12
%135 = load float, ptr addrspace(1) %invariant.gep, align 32
%136 = insertelement <14 x float> zeroinitializer, float %135, i64 0
%137 = load float, ptr addrspace(1) %0, align 8
%138 = insertelement <14 x float> %136, float %137, i64 1
%139 = load float, ptr addrspace(1) %1, align 16
%140 = insertelement <14 x float> %138, float %139, i64 2
%141 = load float, ptr addrspace(1) %2, align 8
%142 = insertelement <14 x float> %140, float %141, i64 3
%143 = load float, ptr addrspace(1) %3, align 32
%144 = insertelement <14 x float> %142, float %143, i64 4
%145 = load float, ptr addrspace(1) %4, align 8
%146 = insertelement <14 x float> %144, float %145, i64 5
%147 = load float, ptr addrspace(1) %5, align 16
%148 = insertelement <14 x float> %146, float %147, i64 6
%149 = load float, ptr addrspace(1) %6, align 8
%150 = insertelement <14 x float> %148, float %149, i64 7
%151 = load float, ptr addrspace(1) %7, align 32
%152 = insertelement <14 x float> %150, float %151, i64 8
%153 = load float, ptr addrspace(1) %8, align 8
; The only vector load in the block, and the only one through the addrspace(7) argument.
; Individual lanes (3, 7, 11, 13) are splatted below to feed the fma chain.
%154 = load <32 x float>, ptr addrspace(7) %50, align 4
%155 = shufflevector <32 x float> %154, <32 x float> zeroinitializer, <14 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%156 = shufflevector <32 x float> %154, <32 x float> zeroinitializer, <14 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
%157 = shufflevector <32 x float> %154, <32 x float> zeroinitializer, <14 x i32> <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13>
%158 = tail call <14 x float> @llvm.fma.v14f32(<14 x float> %156, <14 x float> zeroinitializer, <14 x float> %157)
%159 = insertelement <14 x float> %79, float %80, i64 9
%160 = load float, ptr addrspace(1) %42, align 16
%161 = insertelement <14 x float> %159, float %160, i64 10
%162 = load float, ptr addrspace(1) %48, align 8
%163 = insertelement <14 x float> %161, float %162, i64 11
%164 = getelementptr i8, ptr addrspace(1) %43, i64 96
%165 = load float, ptr addrspace(1) %164, align 32
%166 = insertelement <14 x float> %163, float %165, i64 12
%167 = getelementptr i8, ptr addrspace(1) %43, i64 104
%168 = load float, ptr addrspace(1) %167, align 8
%169 = insertelement <14 x float> %166, float %168, i64 13
%170 = shufflevector <32 x float> %154, <32 x float> zeroinitializer, <14 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%171 = insertelement <14 x float> %58, float %52, i64 10
%172 = tail call <14 x float> @llvm.fma.v14f32(<14 x float> %155, <14 x float> zeroinitializer, <14 x float> %158)
%173 = tail call <14 x float> @llvm.fma.v14f32(<14 x float> %171, <14 x float> zeroinitializer, <14 x float> %172)
%174 = tail call <14 x float> @llvm.fma.v14f32(<14 x float> %169, <14 x float> %170, <14 x float> %173)
%175 = insertelement <14 x float> %152, float %153, i64 9
%176 = load float, ptr addrspace(1) %9, align 16
%177 = insertelement <14 x float> %175, float %176, i64 10
%178 = load float, ptr addrspace(1) %10, align 8
%179 = insertelement <14 x float> %177, float %178, i64 11
%180 = load float, ptr addrspace(1) %11, align 32
%181 = insertelement <14 x float> %179, float %180, i64 12
%182 = load float, ptr addrspace(1) %12, align 8
%183 = insertelement <14 x float> %181, float %182, i64 13
%184 = insertelement <14 x float> %134, float %51, i64 13
%185 = tail call <14 x float> @llvm.fma.v14f32(<14 x float> %108, <14 x float> zeroinitializer, <14 x float> %174)
%186 = tail call <14 x float> @llvm.fma.v14f32(<14 x float> %184, <14 x float> zeroinitializer, <14 x float> %185)
%187 = tail call <14 x float> @llvm.fma.v14f32(<14 x float> %183, <14 x float> zeroinitializer, <14 x float> %186)
; The final fma result is the only value stored; the store uses it, so the chain that
; consumes the loads above is not dead.
store <14 x float> %187, ptr addrspace(1) null, align 8
br label %.lr.ph
}
; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none)
declare <14 x float> @llvm.fma.v14f32(<14 x float>, <14 x float>, <14 x float>) #0
; uselistorder directives
uselistorder ptr @llvm.fma.v14f32, { 6, 5, 4, 3, 2, 1, 0 }
attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
When this is compiled with
llc -o reduced.s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 reduced.ll
the resulting assembly - in relevant part - is as follows
Excerpt of generated assembly
s_clause 0x3
buffer_load_b128 v[19:22], v15, s[100:103], 0 offen offset:32
buffer_load_b128 v[27:30], v15, s[100:103], 0 offen
buffer_load_b128 v[18:21], v15, s[100:103], 0 offen offset:48
buffer_load_b128 v[23:26], v15, s[100:103], 0 offen offset:16
global_load_b32 v16, v[0:1], off
global_load_b32 v17, v[6:7], off
global_load_b32 v20, v[4:5], off
global_load_b32 v23, v[2:3], off
s_waitcnt lgkmcnt(0)
global_load_b32 v24, v14, s[8:9] offset:32
v_readlane_b32 s100, v82, 4
v_readlane_b32 s101, v82, 5
v_readlane_b32 s102, v82, 6
v_readlane_b32 s103, v82, 7
s_clause 0x1f
global_load_b32 v25, v14, s[100:101]
global_load_b32 v27, v14, s[8:9] offset:48
global_load_b32 v28, v14, s[8:9] offset:56
global_load_b32 v29, v14, s[12:13]
global_load_b32 v31, v14, s[14:15]
global_load_b32 v32, v14, s[102:103]
global_load_b32 v33, v14, s[8:9] offset:96
global_load_b32 v34, v14, s[8:9] offset:104
global_load_b32 v35, v14, s[48:49]
global_load_b32 v36, v14, s[6:7]
global_load_b32 v37, v14, s[34:35]
global_load_b32 v38, v14, s[8:9]
global_load_b32 v39, v14, s[10:11]
global_load_b32 v40, v14, s[0:1]
global_load_b32 v41, v14, s[98:99]
global_load_b32 v42, v14, s[96:97]
global_load_b32 v43, v14, s[94:95]
global_load_b32 v44, v14, s[4:5]
global_load_b32 v45, v14, s[2:3]
global_load_b32 v46, v14, s[84:85]
global_load_b32 v47, v14, s[82:83]
global_load_b32 v48, v14, s[80:81]
global_load_b32 v49, v14, s[86:87]
global_load_b32 v50, v14, s[88:89]
global_load_b32 v51, v14, s[90:91]
global_load_b32 v52, v14, s[92:93]
global_load_b32 v53, v14, s[78:79]
global_load_b32 v54, v14, s[58:59]
global_load_b32 v55, v14, s[60:61]
global_load_b32 v56, v14, s[62:63]
global_load_b32 v57, v14, s[64:65]
global_load_b32 v58, v14, s[50:51]
s_clause 0x16
global_load_b32 v59, v14, s[52:53]
global_load_b32 v60, v14, s[54:55]
global_load_b32 v61, v14, s[56:57]
global_load_b32 v62, v14, s[74:75]
global_load_b32 v63, v14, s[72:73]
global_load_b32 v64, v14, s[70:71]
global_load_b32 v65, v14, s[68:69]
global_load_b32 v66, v14, s[76:77]
global_load_b32 v67, v14, s[66:67]
global_load_b32 v68, v14, s[42:43]
global_load_b32 v69, v14, s[40:41]
global_load_b32 v70, v14, s[38:39]
global_load_b32 v71, v14, s[46:47]
global_load_b32 v72, v14, s[22:23]
global_load_b32 v73, v14, s[20:21]
global_load_b32 v74, v14, s[18:19]
global_load_b32 v75, v14, s[30:31]
global_load_b32 v76, v14, s[28:29]
global_load_b32 v77, v14, s[26:27]
global_load_b32 v78, v14, s[24:25]
global_load_b32 v79, v14, s[16:17]
global_load_b32 v80, v14, s[44:45]
global_load_b32 v81, v14, s[36:37]
s_waitcnt vmcnt(61)
v_fmac_f32_e32 v19, 0, v22
s_waitcnt vmcnt(60)
Observe that if we add up the clause lengths (noting that the s_clause operand is the length - 1, so 0x1f, 0x16 and 0x3 mean 32, 23 and 4 loads respectively) we get 32 + 23 + 4 = 59 loads, and then if we include the 5 unclaused global_loads after the buffer_loads, we get 64 total loads that could be outstanding by the time we reach that s_waitcnt vmcnt(61).
While the risk of overflow is theoretical in this case, the unreduced input can reach as many as 128 pending loads, which has caused intermittent correctness issues. This state of 128 (or 64) outstanding loads has been confirmed by checking the si-insert-waitcnts debug logging, which shows both a state where there are 128 registers waiting and no mention of any action taken to correct this error.
(For high-level context / attempting to trigger the runtime issue, see iree-org/iree#22649 )
Files of interest
- input.ll.txt - the un-reduced input
- interesting.sh.txt - my llvm-reduce interestingness test (warning, it took a while to grind out the reduction)
- reduced.ll.txt - a cleaner example of the bug
- reduced.s.txt - the full compiler output for this testcase