diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 9f9ed9db0c588..90abd71d0d330 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -17,11 +17,11 @@ #include float tensor_sum_elements(const ggml_tensor * tensor) { - float sum = 0; - if (tensor->type==GGML_TYPE_F32) { + double sum = 0; + if (tensor->type == GGML_TYPE_F32) { for (int j = 0; j < tensor->ne[1]; j++) { for (int k = 0; k < tensor->ne[0]; k++) { - sum += ((float *) tensor->data)[j*tensor->ne[0]+k]; + sum += ((float *) tensor->data)[j*tensor->ne[0] + k]; } } } @@ -110,12 +110,15 @@ int main(int argc, char ** argv) { //printf("Memsize required = %i\n", sizex*sizex); + // TODO: perform the bench for all types or for a user specified type + const ggml_type qtype = GGML_TYPE_Q4_1; + size_t ctx_size = 0; ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0); - ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0); + ctx_size += sizex*sizey*ggml_type_sizef(qtype); + ctx_size += sizex*sizey*ggml_type_sizef(qtype); ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS ctx_size += 1024*1024*16; @@ -148,7 +151,7 @@ int main(int argc, char ** argv) { struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez); ggml_set_f32(m2, 2.0f); - printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n"); + printf("\n------ Test 1 - Matrix Mult via F32 code\n"); // printf("Creating new tensor m11xm2\n"); struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); @@ -165,17 +168,16 @@ int main(int argc, char ** argv) { TENSOR_DUMP(gf.nodes[0]); - printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n"); + printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); int32_t nelements = sizex*sizey; - int32_t ne[2] = { sizex, sizey }; std::vector hist_cur(1 << 4, 0); // Set up a the benchmark matrices // printf("Creating new tensor q11 & Running quantize\n"); - struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); - ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data()); + struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data()); // Set up a the compute graph // printf("Creating new tensor q31\n"); @@ -187,8 +189,8 @@ int main(int argc, char ** argv) { // Set up a second graph computation to make sure we override the CPU cache lines // printf("Creating new tensor q12 & Running quantize\n"); - struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); - ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data()); + struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data()); // printf("Creating new tensor q32\n"); struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); @@ -206,7 +208,7 @@ int main(int argc, char ** argv) { printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000); - // Let's use the F32 result from above as a reference for the q4_0 multiplication + // Let's use the F32 result from above as a reference for the quantized multiplication float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]); printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");