We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 2272765, commit fece1fe (Copy full SHA for fece1fe)
ggml-cuda/fattn.cu
@@ -225,12 +225,11 @@ static __global__ void flash_attn_vec_ext_f16(
225
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
226
}
227
228
- if (parallel_blocks == 1 || tid != 0) {
229
- return;
230
- }
+ if (parallel_blocks != 1 && tid != 0) {
231
#pragma unroll
232
- for (int j = 0; j < ncols; ++j) {
233
- dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
+ for (int j = 0; j < ncols; ++j) {
+ dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
+ }
234
235
#else
236
NO_DEVICE_CODE;
0 commit comments