We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 1f8a592 · commit 92472ea · Copy full SHA for 92472ea
ggml-cuda.cu
@@ -6462,6 +6462,7 @@ static __global__ void flash_attn_ext_f16(
6462
half16x16_acc lo[Q16][D16];
6463
6464
// load heads from Q to shared memory
6465
+#pragma unroll
6466
for (int j0 = 0; j0 < Q; j0 += num_warps) {
6467
const int j = j0 + warp_id;
6468
if (j >= Q) {
@@ -6470,6 +6471,7 @@ static __global__ void flash_attn_ext_f16(
6470
6471
6472
const float2 * q2 = (const float2 *) (q + ((iq1 + j)*nb01 + iq2*nb02 + iq3*nb03));
6473
6474
6475
for (int i0 = 0; i0 < D2; i0 += NW) {
6476
const int i = i0 + lane_id;
6477
if (i >= D2) {
0 commit comments