
ggml : mul mat tweaks #2372


Merged · 4 commits · Aug 7, 2023

Changes from all commits
134 changes: 79 additions & 55 deletions ggml.c
@@ -10510,71 +10510,95 @@ static void ggml_compute_forward_mul_mat(
         return;
     }
 
-    // parallelize by src0 rows
-    const int64_t dr = (ne01 + nth - 1)/nth;
-
-    const int64_t ir10 = dr*ith;
-    const int64_t ir11 = MIN(ir10 + dr, ne01);
-
-    // src1 rows
-    const int64_t nr1 = ne11*ne12*ne13;
-
-    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
-
-    for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
-        const int64_t i13 = (ir1/(ne12*ne11));
-        const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
-        const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
-
-        const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
-        const int64_t i03 = (ir0/(ne02));
-        // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
-        // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
-        // GG: this is likely the correct way to broadcast, though need some more thought
-        //     therefore leaving the comments to remind us for now
-        const int64_t i02 = (i12 / (ne12 / ne02));
-        // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
-        // const int64_t i02 = (ir0 - i03*ne02);
-
-        const int64_t i1 = i11;
-        const int64_t i2 = i12;
-        const int64_t i3 = i13;
-
-        const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
-
-        // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-        //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-        //       the original src1 data pointer, so we should index using the indices directly
-        // TODO: this is a bit of a hack, we should probably have a better way to handle this
-        const char * src1_col = (const char *) wdata +
-            (src1_cont || src1->type != vec_dot_type
-            ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-            : (i11*nb11 + i12*nb12 + i13*nb13));
-
-        float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
-
-        for (int64_t ir = ir10; ir < ir11; ++ir) {
-            vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
-        }
-
-        //int64_t t1 = ggml_time_us();
-        //static int64_t acc = 0;
-        //acc += t1 - t0;
-        //if (t1 - t0 > 10) {
-        //    printf("\n");
-        //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-        //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-        //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
-        //
-        //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
-        //}
-    }
+    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+
+    const int64_t nr0 = ne01;           // src0 rows
+    const int64_t nr1 = ne11*ne12*ne13; // src1 rows
+
+    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+
+    // distribute the thread work across the inner or outer loop based on which one is larger
+
+    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+    const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+
+    const int64_t ith0 = ith % nth0;
+    const int64_t ith1 = ith / nth0;
+
+    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+    const int64_t ir010 = dr0*ith0;
+    const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+    const int64_t ir110 = dr1*ith1;
+    const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+    // threads with no work simply yield (not sure if it helps)
+    if (ir010 >= ir011 || ir110 >= ir111) {
+        sched_yield();
+        return;
+    }
+
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    // block-tiling attempt
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
+
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    float tmp[16];
+
+    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                const int64_t i13 = (ir1/(ne12*ne11));
+                const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
+                const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+
+                // broadcast src0 into src1
+                const int64_t i03 = i13/r3;
+                const int64_t i02 = i12/r2;
+
+                const int64_t i1 = i11;
+                const int64_t i2 = i12;
+                const int64_t i3 = i13;
+
+                const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                //       the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char *) wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                    ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
+                    : (i11*nb11 + i12*nb12 + i13*nb13));
+
+                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                //}
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
Inline review thread on the vec_dot call above:

Collaborator:
As far as I know vec_dot is cache unfriendly for gemm. IIRC you need a different loop order to reach high FLOPs.
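
As a generic, textbook-style illustration of that loop-order point (invented shapes, not code from this PR or this thread): both kernels below compute the same row-major C = A*B, but gemm_ikj streams B and C with unit stride and reuses each element of A across a whole output row, which is the kind of reordering that usually lifts GEMM throughput.

#include <stdio.h>

enum { M = 64, N = 64, K = 64 }; // invented sizes for the demo

// naive ijk: the inner loop walks B[k][j] down a column (stride N), so
// every step of the dot product touches a new cache line of B
static void gemm_ijk(const float *A, const float *B, float *C) {
    for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j) {
            float sum = 0.0f;
            for (int k = 0; k < K; ++k)
                sum += A[i*K + k]*B[k*N + j];
            C[i*N + j] = sum;
        }
}

// ikj: A[i*K + k] stays in a register while B and C are read and written
// sequentially, which is far friendlier to caches and prefetchers
static void gemm_ikj(const float *A, const float *B, float *C) {
    for (int i = 0; i < M*N; ++i) C[i] = 0.0f;
    for (int i = 0; i < M; ++i)
        for (int k = 0; k < K; ++k) {
            const float a = A[i*K + k];
            for (int j = 0; j < N; ++j)
                C[i*N + j] += a*B[k*N + j];
        }
}

int main(void) {
    static float A[M*K], B[K*N], C1[M*N], C2[M*N];
    for (int i = 0; i < M*K; ++i) A[i] = (float) (i % 7);
    for (int i = 0; i < K*N; ++i) B[i] = (float) (i % 5);
    gemm_ijk(A, B, C1);
    gemm_ikj(A, B, C2);
    printf("C1[0] = %f, C2[0] = %f (should match)\n", C1[0], C2[0]);
    return 0;
}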

Member Author:

Open to suggestions for alternative implementation.

I think what I achieve with these changes is to put a few rows from both tensors in L2 cache and do the full dot products. Maybe there is something smarter to utilize the L1 cache better?
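
A minimal standalone sketch of that description, assuming invented sizes and a toy vec_dot_f32 helper (this is not the PR's code): a block of 16 src0-like rows is dotted against one src1-like column while both stay warm in cache, and the results are staged in a local tmp before a single memcpy, mirroring the tmp[16] false-sharing workaround in the diff.

#include <stdio.h>
#include <string.h>

enum { ROWS = 64, COLS = 128, BLCK = 16 }; // toy shapes; BLCK mirrors blck_0 = 16

static float vec_dot_f32(int n, const float *x, const float *y) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) sum += x[i]*y[i];
    return sum;
}

int main(void) {
    static float src0[ROWS][COLS], src1_col[COLS], dst[ROWS];

    for (int r = 0; r < ROWS; ++r)
        for (int c = 0; c < COLS; ++c)
            src0[r][c] = (float) (r + c) / COLS;
    for (int c = 0; c < COLS; ++c)
        src1_col[c] = 1.0f / (float) (c + 1);

    // process BLCK rows per block: the block of src0 rows plus the single
    // src1 column stay resident in cache while each row's full dot product
    // is finished; results are staged in tmp so dst is written once per
    // block instead of once per row (the false-sharing idea)
    float tmp[BLCK];
    for (int ir0 = 0; ir0 < ROWS; ir0 += BLCK) {
        const int end = ir0 + BLCK < ROWS ? ir0 + BLCK : ROWS;
        for (int r = ir0; r < end; ++r)
            tmp[r - ir0] = vec_dot_f32(COLS, src0[r], src1_col);
        memcpy(&dst[ir0], tmp, (size_t)(end - ir0)*sizeof(float));
    }

    printf("dst[0] = %f, dst[%d] = %f\n", dst[0], ROWS - 1, dst[ROWS - 1]);
    return 0;
}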

Collaborator:

Here is the best I came up with for now, without trying to rebuild OpenBLAS.

+                }
+                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+            }
+        }
+    }
 }
 
 // ggml_compute_forward_out_prod
 
 static void ggml_compute_forward_out_prod_f32(
     const struct ggml_compute_params * params,
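
To make the new work-splitting concrete, here is a small self-contained demo of the scheduling and broadcast arithmetic introduced above (the tensor shapes and thread count are invented; MIN mirrors the macro used in ggml.c). It prints, for each thread, the [ir010, ir011) x [ir110, ir111) rectangle the merged code would assign, and shows how the broadcast factors r2/r3 map src1 batch indices back into src0.

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    // invented shapes: src0 contributes nr0 rows, src1 contributes nr1 rows
    const long long ne01 = 4096, ne11 = 8, ne12 = 32, ne13 = 1;
    const long long ne02 = 8, ne03 = 1; // src0 batch dims, broadcast into src1's
    const int nth = 4;                  // number of threads

    const long long nr0 = ne01;           // src0 rows
    const long long nr1 = ne11*ne12*ne13; // src1 rows

    // all threads go to whichever loop is larger, as in the diff
    const long long nth0 = nr0 > nr1 ? nth : 1;
    const long long nth1 = nr0 > nr1 ? 1 : nth;

    // broadcast factors: each src0 batch is reused r2*r3 times
    const long long r2 = ne12/ne02;
    const long long r3 = ne13/ne03;
    printf("broadcast factors: r2 = %lld, r3 = %lld\n", r2, r3);

    for (int ith = 0; ith < nth; ++ith) {
        const long long ith0 = ith % nth0;
        const long long ith1 = ith / nth0;

        const long long dr0 = (nr0 + nth0 - 1)/nth0;
        const long long dr1 = (nr1 + nth1 - 1)/nth1;

        const long long ir010 = dr0*ith0, ir011 = MIN(ir010 + dr0, nr0);
        const long long ir110 = dr1*ith1, ir111 = MIN(ir110 + dr1, nr1);

        printf("thread %d: src0 rows [%lld, %lld), src1 rows [%lld, %lld)\n",
               ith, ir010, ir011, ir110, ir111);
    }

    // src1 batch i12 reads src0 batch i02 = i12/r2 (same idea for i13 and r3)
    for (long long i12 = 0; i12 < ne12; i12 += 8) {
        printf("src1 batch i12 = %2lld -> src0 batch i02 = %lld\n", i12, i12/r2);
    }
    return 0;
}

With nr0 = 4096 far larger than nr1 = 256, all four threads split the src0 rows and each covers the full src1 range; shrinking src0 or growing the src1 batch flips the split to the outer loop, which is exactly what the nr0 > nr1 test decides.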