@@ -33,11 +33,11 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
 }
 
 float tensor_sum_elements(const ggml_tensor * tensor) {
-    float sum = 0;
-    if (tensor->type==GGML_TYPE_F32) {
+    double sum = 0;
+    if (tensor->type == GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
             for (int k = 0; k < tensor->ne[0]; k++) {
-                sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
+                sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
             }
         }
     }
@@ -126,12 +126,15 @@ int main(int argc, char ** argv) {
 
     // printf("Memsize required = %i\n", sizex*sizex);
 
+    // TODO: perform the bench for all types or for a user specified type
+    const ggml_type qtype = GGML_TYPE_Q4_1;
+
     size_t ctx_size = 0;
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += 1024*1024*16;
@@ -164,7 +167,7 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
     ggml_set_f32(m2, 2.0f);
 
-    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 1 - Matrix Mult via F32 code\n");
     // printf("Creating new tensor m11xm2\n");
     struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
 
@@ -182,17 +185,16 @@ int main(int argc, char ** argv) {
 
     TENSOR_DUMP(gf.nodes[0]);
 
-    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
 
     int32_t nelements = sizex*sizey;
-    int32_t ne[2] = { sizex, sizey };
 
     std::vector<int64_t> hist_cur(1 << 4, 0);
 
     // Set up a the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
-    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
 
     // Set up a the compute graph
     // printf("Creating new tensor q31\n");
@@ -203,8 +205,8 @@ int main(int argc, char ** argv) {
 
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
-    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
 
     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
@@ -221,7 +223,7 @@ int main(int argc, char ** argv) {
     printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
 
 
-    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    // Let's use the F32 result from above as a reference for the quantized multiplication
     float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
 
     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
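Note: the TODO in the second hunk (bench all types, or a user-specified type) could be addressed with a small argument parser on top of the `qtype` constant this patch introduces. A minimal sketch follows, using only identifiers that appear in the patch (ggml_type, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1); the "-t" flag and the parse_qtype() helper are hypothetical, not part of this commit:

    #include <cstring>
    #include "ggml.h"

    // Hypothetical helper: map a "-t <name>" command-line argument to a
    // ggml_type, falling back to the patch's hardcoded default of Q4_1.
    static ggml_type parse_qtype(int argc, char ** argv) {
        ggml_type qtype = GGML_TYPE_Q4_1;
        for (int i = 1; i + 1 < argc; i++) {
            if (std::strcmp(argv[i], "-t") == 0) {
                if (std::strcmp(argv[i + 1], "q4_0") == 0) { qtype = GGML_TYPE_Q4_0; }
                if (std::strcmp(argv[i + 1], "q4_1") == 0) { qtype = GGML_TYPE_Q4_1; }
            }
        }
        return qtype;
    }

Because the rest of the patch already routes everything (context sizing, tensor creation, quantization via ggml_quantize_chunk, and the test banner via ggml_type_name) through `qtype`, swapping the constant for such a parsed value would be the only change needed.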