@@ -339,6 +339,7 @@ struct hash_node {
339
339
};
340
340
341
341
// Allocation record for one tensor: which backend buffer it lives in and
// where inside that buffer it was placed by the dynamic allocator.
struct tensor_alloc {
    int    buffer_id;  // index into galloc->buffers / bufts; -1 when the tensor needs no allocation
    size_t offset;     // byte offset inside the buffer (SIZE_MAX when unallocated)
    size_t size_max;   // 0 = pre-allocated, unused, or view
};
@@ -349,7 +350,6 @@ struct leaf_alloc {
349
350
};
350
351
351
352
struct node_alloc {
352
- int buffer_id ;
353
353
struct tensor_alloc dst ;
354
354
struct tensor_alloc src [GGML_MAX_SRC ];
355
355
};
@@ -386,8 +386,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
386
386
for (int i = 0 ; i < n_bufs ; i ++ ) {
387
387
galloc -> bufts [i ] = bufts [i ];
388
388
galloc -> buffers [i ] = NULL ;
389
- size_t alignment = ggml_backend_buft_get_alignment (bufts [i ]);
390
- galloc -> buf_tallocs [i ] = ggml_dyn_tallocr_new (alignment );
389
+
390
+ // check if the same buffer type is used multiple times and reuse the same allocator
391
+ for (int j = 0 ; j < i ; j ++ ) {
392
+ if (bufts [i ] == bufts [j ]) {
393
+ galloc -> buf_tallocs [i ] = galloc -> buf_tallocs [j ];
394
+ break ;
395
+ }
396
+ }
397
+
398
+ if (galloc -> buf_tallocs [i ] == NULL ) {
399
+ size_t alignment = ggml_backend_buft_get_alignment (bufts [i ]);
400
+ galloc -> buf_tallocs [i ] = ggml_dyn_tallocr_new (alignment );
401
+ }
391
402
}
392
403
galloc -> n_buffers = n_bufs ;
393
404
@@ -405,10 +416,30 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
405
416
406
417
for (int i = 0 ; i < galloc -> n_buffers ; i ++ ) {
407
418
if (galloc -> buffers != NULL ) {
408
- ggml_backend_buffer_free (galloc -> buffers [i ]);
419
+ // skip if already freed
420
+ bool freed = false;
421
+ for (int j = 0 ; j < i ; j ++ ) {
422
+ if (galloc -> buffers [j ] == galloc -> buffers [i ]) {
423
+ freed = true;
424
+ break ;
425
+ }
426
+ }
427
+ if (!freed ) {
428
+ ggml_backend_buffer_free (galloc -> buffers [i ]);
429
+ }
409
430
}
410
431
if (galloc -> buf_tallocs != NULL ) {
411
- ggml_dyn_tallocr_free (galloc -> buf_tallocs [i ]);
432
+ // skip if already freed
433
+ bool freed = false;
434
+ for (int j = 0 ; j < i ; j ++ ) {
435
+ if (galloc -> buf_tallocs [j ] == galloc -> buf_tallocs [i ]) {
436
+ freed = true;
437
+ break ;
438
+ }
439
+ }
440
+ if (!freed ) {
441
+ ggml_dyn_tallocr_free (galloc -> buf_tallocs [i ]);
442
+ }
412
443
}
413
444
}
414
445
@@ -511,17 +542,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
511
542
}
512
543
}
513
544
514
- static void ggml_gallocr_free_node (ggml_gallocr_t galloc , struct ggml_tensor * node , int buffer_id ) {
545
+ static void ggml_gallocr_free_node (ggml_gallocr_t galloc , struct ggml_tensor * node ) {
515
546
// graph outputs are never freed
516
547
if (node -> flags & GGML_TENSOR_FLAG_OUTPUT ) {
517
548
AT_PRINTF ("not freeing output %s\n" , node -> name );
518
549
return ;
519
550
}
520
551
521
- struct ggml_dyn_tallocr * alloc = galloc -> buf_tallocs [buffer_id ];
522
- ggml_backend_buffer_type_t buft = galloc -> bufts [buffer_id ];
523
552
struct hash_node * hn = ggml_gallocr_hash_get (galloc , node );
524
553
size_t offset = hn -> offset ;
554
+ int buffer_id = hn -> buffer_id ;
555
+ struct ggml_dyn_tallocr * alloc = galloc -> buf_tallocs [buffer_id ];
556
+ ggml_backend_buffer_type_t buft = galloc -> bufts [buffer_id ];
525
557
size_t size = ggml_backend_buft_get_alloc_size (buft , node );
526
558
ggml_dyn_tallocr_free_tensor (alloc , offset , size , node );
527
559
hn -> allocated = false;
@@ -626,11 +658,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
626
658
AT_PRINTF ("view_src %s: %d children, %d views\n" ,
627
659
view_src -> name , view_src_hn -> n_children , view_src_hn -> n_views );
628
660
if (view_src_hn -> n_views == 0 && view_src_hn -> n_children == 0 && view_src_hn -> allocated ) {
629
- ggml_gallocr_free_node (galloc , view_src , buffer_id );
661
+ ggml_gallocr_free_node (galloc , view_src );
630
662
}
631
663
}
632
664
else if (p_hn -> allocated ) {
633
- ggml_gallocr_free_node (galloc , parent , buffer_id );
665
+ ggml_gallocr_free_node (galloc , parent );
634
666
}
635
667
}
636
668
AT_PRINTF ("\n" );
@@ -674,22 +706,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
674
706
for (int i = 0 ; i < graph -> n_nodes ; i ++ ) {
675
707
struct ggml_tensor * node = graph -> nodes [i ];
676
708
struct node_alloc * node_alloc = & galloc -> node_allocs [i ];
677
- node_alloc -> buffer_id = get_node_buffer_id (node_buffer_ids , i );
678
709
if (node -> view_src || node -> data ) {
710
+ node_alloc -> dst .buffer_id = -1 ;
679
711
node_alloc -> dst .offset = SIZE_MAX ;
680
712
node_alloc -> dst .size_max = 0 ;
681
713
} else {
682
714
struct hash_node * hn = ggml_gallocr_hash_get (galloc , node );
683
- node_alloc -> dst .offset = hn -> offset ;
684
- node_alloc -> dst .size_max = ggml_backend_buft_get_alloc_size (galloc -> bufts [hn -> buffer_id ], node );
715
+ node_alloc -> dst .buffer_id = hn -> buffer_id ;
716
+ node_alloc -> dst .offset = hn -> offset ;
717
+ node_alloc -> dst .size_max = ggml_backend_buft_get_alloc_size (galloc -> bufts [hn -> buffer_id ], node );
685
718
}
686
719
for (int j = 0 ; j < GGML_MAX_SRC ; j ++ ) {
687
720
struct ggml_tensor * src = node -> src [j ];
688
721
if (!src || src -> view_src || src -> data ) {
722
+ node_alloc -> src [j ].buffer_id = -1 ;
689
723
node_alloc -> src [j ].offset = SIZE_MAX ;
690
724
node_alloc -> src [j ].size_max = 0 ;
691
725
} else {
692
726
struct hash_node * hn = ggml_gallocr_hash_get (galloc , src );
727
+ node_alloc -> src [j ].buffer_id = hn -> buffer_id ;
693
728
node_alloc -> src [j ].offset = hn -> offset ;
694
729
node_alloc -> src [j ].size_max = ggml_backend_buft_get_alloc_size (galloc -> bufts [hn -> buffer_id ], src );
695
730
}
@@ -706,16 +741,26 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
706
741
struct hash_node * hn = ggml_gallocr_hash_get (galloc , leaf );
707
742
galloc -> leaf_allocs [i ].buffer_id = hn -> buffer_id ;
708
743
if (leaf -> view_src || leaf -> data ) {
744
+ galloc -> leaf_allocs [i ].leaf .buffer_id = -1 ;
709
745
galloc -> leaf_allocs [i ].leaf .offset = SIZE_MAX ;
710
746
galloc -> leaf_allocs [i ].leaf .size_max = 0 ;
711
747
} else {
748
+ galloc -> leaf_allocs [i ].leaf .buffer_id = hn -> buffer_id ;
712
749
galloc -> leaf_allocs [i ].leaf .offset = hn -> offset ;
713
750
galloc -> leaf_allocs [i ].leaf .size_max = ggml_backend_buft_get_alloc_size (galloc -> bufts [hn -> buffer_id ], leaf );
714
751
}
715
752
}
716
753
717
754
// reallocate buffers if needed
718
755
for (int i = 0 ; i < galloc -> n_buffers ; i ++ ) {
756
+ // if the buffer type is used multiple times, we reuse the same buffer
757
+ for (int j = 0 ; j < i ; j ++ ) {
758
+ if (galloc -> buf_tallocs [j ] == galloc -> buf_tallocs [i ]) {
759
+ galloc -> buffers [i ] = galloc -> buffers [j ];
760
+ break ;
761
+ }
762
+ }
763
+
719
764
size_t cur_size = galloc -> buffers [i ] ? ggml_backend_buffer_get_size (galloc -> buffers [i ]) : 0 ;
720
765
size_t new_size = ggml_dyn_tallocr_max_size (galloc -> buf_tallocs [i ]);
721
766
@@ -724,6 +769,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
724
769
#ifndef NDEBUG
725
770
fprintf (stderr , "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n" , __func__ , ggml_backend_buft_name (galloc -> bufts [i ]), cur_size / 1024.0 / 1024.0 , new_size / 1024.0 / 1024.0 );
726
771
#endif
772
+
727
773
ggml_backend_buffer_free (galloc -> buffers [i ]);
728
774
galloc -> buffers [i ] = ggml_backend_buft_alloc_buffer (galloc -> bufts [i ], new_size );
729
775
if (galloc -> buffers [i ] == NULL ) {
@@ -740,7 +786,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
740
786
return ggml_gallocr_reserve_n (galloc , graph , NULL , NULL );
741
787
}
742
788
743
- static void ggml_gallocr_init_tensor (ggml_gallocr_t galloc , struct ggml_tensor * tensor , int buffer_id , struct tensor_alloc * tensor_alloc ) {
789
+ static void ggml_gallocr_init_tensor (ggml_gallocr_t galloc , struct ggml_tensor * tensor , struct tensor_alloc * tensor_alloc ) {
790
+ int buffer_id = tensor_alloc -> buffer_id ;
744
791
assert (tensor -> data || tensor -> view_src || ggml_backend_buffer_get_alloc_size (galloc -> buffers [buffer_id ], tensor ) <= tensor_alloc -> size_max );
745
792
746
793
if (tensor -> view_src != NULL ) {
@@ -768,8 +815,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
768
815
}
769
816
}
770
817
771
- static bool ggml_gallocr_node_needs_realloc (ggml_gallocr_t galloc , struct ggml_tensor * node , struct node_alloc * nalloc , struct tensor_alloc * talloc ) {
772
- ggml_backend_buffer_type_t buft = galloc -> bufts [nalloc -> buffer_id ];
818
+ static bool ggml_gallocr_node_needs_realloc (ggml_gallocr_t galloc , struct ggml_tensor * node , struct tensor_alloc * talloc ) {
819
+ ggml_backend_buffer_type_t buft = talloc -> buffer_id != -1 ? galloc -> bufts [talloc -> buffer_id ] : NULL ;
773
820
size_t node_size = (node -> data || node -> view_src ) ? 0 : ggml_backend_buft_get_alloc_size (buft , node );
774
821
return talloc -> size_max >= node_size ;
775
822
}
@@ -793,7 +840,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
793
840
struct ggml_tensor * node = graph -> nodes [i ];
794
841
struct node_alloc * node_alloc = & galloc -> node_allocs [i ];
795
842
796
- if (!ggml_gallocr_node_needs_realloc (galloc , node , node_alloc , & node_alloc -> dst )) {
843
+ if (!ggml_gallocr_node_needs_realloc (galloc , node , & node_alloc -> dst )) {
797
844
#ifndef NDEBUG
798
845
fprintf (stderr , "%s: node %s is not valid\n" , __func__ , node -> name );
799
846
#endif
@@ -805,7 +852,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
805
852
if (src == NULL ) {
806
853
continue ;
807
854
}
808
- if (!ggml_gallocr_node_needs_realloc (galloc , src , node_alloc , & node_alloc -> src [j ])) {
855
+ if (!ggml_gallocr_node_needs_realloc (galloc , src , & node_alloc -> src [j ])) {
809
856
#ifndef NDEBUG
810
857
fprintf (stderr , "%s: src %d (%s) of node %s is not valid\n" , __func__ , j , src -> name , node -> name );
811
858
#endif
@@ -846,7 +893,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
846
893
for (int i = 0 ; i < graph -> n_leafs ; i ++ ) {
847
894
struct ggml_tensor * leaf = graph -> leafs [i ];
848
895
struct leaf_alloc * leaf_alloc = & galloc -> leaf_allocs [i ];
849
- ggml_gallocr_init_tensor (galloc , leaf , leaf_alloc -> buffer_id , & leaf_alloc -> leaf );
896
+ ggml_gallocr_init_tensor (galloc , leaf , & leaf_alloc -> leaf );
850
897
}
851
898
// nodes
852
899
for (int i = 0 ; i < graph -> n_nodes ; i ++ ) {
@@ -857,9 +904,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
857
904
if (src == NULL ) {
858
905
continue ;
859
906
}
860
- ggml_gallocr_init_tensor (galloc , src , node_alloc -> buffer_id , & node_alloc -> src [j ]);
907
+ ggml_gallocr_init_tensor (galloc , src , & node_alloc -> src [j ]);
861
908
}
862
- ggml_gallocr_init_tensor (galloc , node , node_alloc -> buffer_id , & node_alloc -> dst );
909
+ ggml_gallocr_init_tensor (galloc , node , & node_alloc -> dst );
863
910
}
864
911
865
912
return true;
@@ -871,6 +918,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
871
918
if (galloc -> buffers [buffer_id ] == NULL ) {
872
919
return 0 ;
873
920
}
921
+
922
+ for (int i = 0 ; i < buffer_id ; i ++ ) {
923
+ if (galloc -> buffers [i ] == galloc -> buffers [buffer_id ]) {
924
+ // this buffer is the same as a previous one due to the same buffer type being used multiple times
925
+ // only return the buffer size the first time it appears to avoid double counting
926
+ return 0 ;
927
+ }
928
+ }
929
+
874
930
return ggml_backend_buffer_get_size (galloc -> buffers [buffer_id ]);
875
931
}
876
932
0 commit comments