Skip to content

Commit 92472ea

Browse files
committed
cuda : unroll some of the loops
1 parent 1f8a592 commit 92472ea

File tree

1 file changed

+2
-0
lines changed

1 file changed

+2
-0
lines changed

ggml-cuda.cu

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6462,6 +6462,7 @@ static __global__ void flash_attn_ext_f16(
64626462
half16x16_acc lo[Q16][D16];
64636463

64646464
// load heads from Q to shared memory
6465+
#pragma unroll
64656466
for (int j0 = 0; j0 < Q; j0 += num_warps) {
64666467
const int j = j0 + warp_id;
64676468
if (j >= Q) {
@@ -6470,6 +6471,7 @@ static __global__ void flash_attn_ext_f16(
64706471

64716472
const float2 * q2 = (const float2 *) (q + ((iq1 + j)*nb01 + iq2*nb02 + iq3*nb03));
64726473

6474+
#pragma unroll
64736475
for (int i0 = 0; i0 < D2; i0 += NW) {
64746476
const int i = i0 + lane_id;
64756477
if (i >= D2) {

0 commit comments

Comments
 (0)