From 768ecfcc280711b6461f31035a975433851aba46 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 26 Jul 2023 17:13:58 +0200 Subject: [PATCH 01/13] ggml : add graph tensor allocator --- CMakeLists.txt | 2 + Makefile | 7 +- ggml-alloc.c | 488 +++++++++++++++++++++++++++++++++++++++++++++++++ ggml-alloc.h | 21 +++ ggml.c | 14 ++ ggml.h | 19 +- llama.cpp | 198 ++++++++++++++++---- 7 files changed, 707 insertions(+), 42 deletions(-) create mode 100644 ggml-alloc.c create mode 100644 ggml-alloc.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c43e65e746320..addbd0f9b7e27 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -497,6 +497,8 @@ endif() add_library(ggml OBJECT ggml.c ggml.h + ggml-alloc.c + ggml-alloc.h ${GGML_SOURCES_CUDA} ${GGML_SOURCES_OPENCL} ${GGML_SOURCES_METAL} diff --git a/Makefile b/Makefile index 2035c525338a7..eb167398a2362 100644 --- a/Makefile +++ b/Makefile @@ -318,7 +318,12 @@ $(info ) ggml.o: ggml.c ggml.h ggml-cuda.h $(CC) $(CFLAGS) -c $< -o $@ -llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h +ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h + $(CC) $(CFLAGS) -c $< -o $@ + +OBJS += ggml-alloc.o + +llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h $(CXX) $(CXXFLAGS) -c $< -o $@ common.o: examples/common.cpp examples/common.h diff --git a/ggml-alloc.c b/ggml-alloc.c new file mode 100644 index 0000000000000..20ab23cf78e9d --- /dev/null +++ b/ggml-alloc.c @@ -0,0 +1,488 @@ +#include "ggml-alloc.h" +#include "ggml.h" +#include <assert.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#define UNUSED(x) (void)(x) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +//#define GGML_ALLOCATOR_DEBUG + +//#define AT_PRINTF printf +#define AT_PRINTF(...) ((void)0) + + +// TODO: GGML_PAD ? 
+static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { + assert(alignment && !(alignment & (alignment - 1))); // power of 2 + size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment; + return offset + align; +} + +struct free_block { + void * addr; + size_t size; +}; + +#define MAX_FREE_BLOCKS 128 + +struct ggml_allocator { + void * data; + size_t size; + size_t alignment; + int n_free_blocks; + struct free_block free_blocks[MAX_FREE_BLOCKS]; + size_t max_size; + bool measure; + +#ifdef GGML_ALLOCATOR_DEBUG + struct ggml_tensor * allocated_tensors[1024]; +#endif +}; + +#ifdef GGML_ALLOCATOR_DEBUG +static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) { + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i] == NULL) { + alloc->allocated_tensors[i] = tensor; + return; + } + } + GGML_ASSERT(!"out of allocated_tensors"); +} +static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) { + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i] == tensor || + (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) { + alloc->allocated_tensors[i] = NULL; + return; + } + } + printf("tried to free tensor %s not found\n", tensor->name); + GGML_ASSERT(!"tensor not found"); +} +#endif + + +static size_t ggml_allocator_get_alloc_size(struct ggml_allocator * alloc, struct ggml_tensor * tensor) { + return ggml_nbytes(tensor); + + UNUSED(alloc); +} + +void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) { + size_t size = ggml_allocator_get_alloc_size(alloc, tensor); + size = aligned_offset(NULL, size, alloc->alignment); + + AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); + + size_t max_avail = 0; + + // find the best fitting free block + int best_fit_block = -1; + size_t best_fit_size = SIZE_MAX; + for (int i = 0; i < 
alloc->n_free_blocks; i++) { + struct free_block * block = &alloc->free_blocks[i]; + max_avail = MAX(max_avail, block->size); + if (block->size >= size && block->size <= best_fit_size) { + best_fit_block = i; + best_fit_size = block->size; + } + } + + AT_PRINTF("block %d\n", best_fit_block); + + if (best_fit_block == -1) { + fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", + __func__, size, max_avail); + GGML_ASSERT(!"not enough space in the buffer"); + return; + } + struct free_block * block = &alloc->free_blocks[best_fit_block]; + void * addr = block->addr; + block->addr = (char*)block->addr + size; + block->size -= size; + if (block->size == 0) { + // remove block if empty + alloc->n_free_blocks--; + for (int j = best_fit_block; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + + tensor->data = addr; + +#ifdef GGML_ALLOCATOR_DEBUG + add_allocated_tensor(alloc, tensor); + size_t cur_max = (char*)addr - (char*)alloc->data + size; + if (cur_max > alloc->max_size) { + printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i]) { + printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0); + } + } + printf("\n"); + } +#endif + + alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size); +} + +// this is a very naive implementation, but for our case the number of free blocks should be very small +static void ggml_allocator_free_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) { + void * ptr = tensor->data; + + if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) { + // the tensor was not allocated in this buffer + // this can happen because the graph allocator will try to free weights and other tensors from different buffers + // the easiest way to deal with this is just to 
ignore it + return; + } + + size_t size = ggml_allocator_get_alloc_size(alloc, tensor); + size = aligned_offset(NULL, size, alloc->alignment); + AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks); + +#ifdef GGML_ALLOCATOR_DEBUG + remove_allocated_tensor(alloc, tensor); +#endif + + // see if we can merge with an existing block + for (int i = 0; i < alloc->n_free_blocks; i++) { + struct free_block * block = &alloc->free_blocks[i]; + // check if ptr is at the end of the block + if ((char*)block->addr + block->size == ptr) { + block->size += size; + // check if we can merge with the next block + if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) { + block->size += alloc->free_blocks[i+1].size; + alloc->n_free_blocks--; + for (int j = i+1; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + return; + } + // check if ptr is at the beginning of the block + if ((char*)ptr + size == block->addr) { + block->addr = ptr; + block->size += size; + // check if we can merge with the previous block + if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) { + alloc->free_blocks[i-1].size += block->size; + alloc->n_free_blocks--; + for (int j = i; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + return; + } + } + // otherwise, add a new block + GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks"); + // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster) + int insert_pos = 0; + while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) { + insert_pos++; + } + // shift all blocks from insert_pos onward to make room for the new block + for (int i = alloc->n_free_blocks; i > insert_pos; i--) { + alloc->free_blocks[i] = 
alloc->free_blocks[i-1]; + } + // insert the new block + alloc->free_blocks[insert_pos].addr = ptr; + alloc->free_blocks[insert_pos].size = size; + alloc->n_free_blocks++; +} + +void ggml_allocator_reset(struct ggml_allocator * alloc) { + alloc->n_free_blocks = 1; + size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); + alloc->free_blocks[0].addr = (char *)alloc->data + align_offset; + alloc->free_blocks[0].size = alloc->size - align_offset; +} + +struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alignment) { + struct ggml_allocator * alloc = (struct ggml_allocator *)malloc(sizeof(struct ggml_allocator) /* + n_free_blocks * sizeof(struct free_block) */); + + *alloc = (struct ggml_allocator){ + /*.data = */ data, + /*.size = */ size, + /*.alignment = */ alignment, + /*.n_free_blocks = */ 0, + /*.free_blocks = */ {{0}}, + /*.max_size = */ 0, + /*.measure = */ false, +#ifdef GGML_ALLOCATOR_DEBUG + /*.allocated_tensors = */ {0}, +#endif + }; + + ggml_allocator_reset(alloc); + + return alloc; +} + +// address and size of the buffer when measuring +// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers +static void * const MEASURE_BASE_ADDR = (void *) 0x1000; +static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB + +struct ggml_allocator * ggml_allocator_new_measure(size_t alignment) { + struct ggml_allocator * alloc = (struct ggml_allocator *)malloc(sizeof(struct ggml_allocator) /* + n_free_blocks * sizeof(struct free_block) */); + + *alloc = (struct ggml_allocator){ + /*.data = */ MEASURE_BASE_ADDR, + /*.size = */ MEASURE_MAX_SIZE, + /*.alignment = */ alignment, + /*.n_free_blocks = */ 0, + /*.free_blocks = */ {{0}}, + /*.max_size = */ 0, + /*.measure = */ true, +#ifdef GGML_ALLOCATOR_DEBUG + /*.allocated_tensors = */ {0}, +#endif + }; + + ggml_allocator_reset(alloc); + + return alloc; +} + +void ggml_allocator_free(struct ggml_allocator * alloc) { + free(alloc); +} + 
+bool ggml_allocator_is_measure(struct ggml_allocator * alloc) { + return alloc->measure; +} + +//////////// compute graph allocator + +static bool ggml_is_view(struct ggml_tensor * t) { + return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || + t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY; +} + +static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { + if (a->type != b->type) { + return false; + } + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (a->ne[i] != b->ne[i]) { + return false; + } + if (a->nb[i] != b->nb[i]) { + return false; + } + } + return true; +} + +static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) { + switch (t->op) { + case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: + case GGML_OP_TRANSPOSE: + case GGML_OP_VIEW: + return t->src[0]; + case GGML_OP_CPY: + return t->src[1]; + default: + return NULL; + } +} + +static struct ggml_tensor * get_view_source(struct ggml_tensor * t) { + struct ggml_tensor * parent = t; + do { + parent = get_view_parent(parent); + } while (ggml_is_view(parent)); + return parent; +} + +static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * node) { + if (node->data == NULL) { + if (ggml_is_view(node)) { + size_t offset; + switch(node->op) { + case GGML_OP_VIEW: + memcpy(&offset, node->op_params, sizeof(size_t)); + node->data = (char *) node->src[0]->data + offset; + break; + case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: + case GGML_OP_TRANSPOSE: + node->data = node->src[0]->data; + break; + case GGML_OP_CPY: + node->data = node->src[1]->data; + break; + default: + GGML_ASSERT(!"unknown view op"); + break; + } + } else { + // see if we can reuse a parent's buffer (inplace) + for (int i = 0; i < GGML_MAX_SRC; i++) { + struct ggml_tensor * parent = node->src[i]; + if (parent == NULL) { + break; + } + // TODO: make a list of operations that can be safely made inplace + if (parent->data != NULL && parent->n_children 
== 1 && parent->n_views == 0 && ggml_are_same_layout(node, parent) && node->op != GGML_OP_MUL_MAT) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = get_view_source(parent); + if (view_src->n_views == 1 && view_src->n_children == 0 && view_src->data == parent->data) { + // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite + // the parent's data that it will need later (same layout requirement). the problem is that then + // we cannot free the tensor because the original address of the allocation is lost. + // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views + // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) + AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); + node->data = parent->data; + return; + } + } + else { + AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); + node->data = parent->data; + } + return; + } + } + ggml_allocator_alloc_tensor(alloc, node); + } + } +} + +static size_t ggml_allocator_alloc_graph_tensors_n( + struct ggml_allocator * alloc, + struct ggml_cgraph ** graphs, int n_graphs, + struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) { + + // reset counters + for (int g = 0; g < n_graphs; g++) { + struct ggml_cgraph * gf = graphs[g]; + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + node->n_children = 0; + node->n_views = 0; + } + + for (int i = 0; i < gf->n_leafs; i++) { + struct ggml_tensor * leaf = gf->leafs[i]; + leaf->n_children = 0; + leaf->n_views = 0; + } + } + + // count number of children and views + for (int g = 0; g < n_graphs; g++) { + struct ggml_cgraph * gf = graphs[g]; + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + + if (ggml_is_view(node)) { + struct ggml_tensor * view_src = get_view_source(node); + view_src->n_views += 1; + } + + 
for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + parent->n_children += 1; + } + } + } + + // allocate tensors + for (int g = 0; g < n_graphs; g++) { + struct ggml_cgraph * gf = graphs[g]; + AT_PRINTF("####### graph %d/%d\n", g, n_graphs); + // graph inputs are allocated first to ensure that they are never overwritten + if (inputs != NULL && inputs[g] != NULL) { + for (int i = 0; inputs[g][i] != NULL; i++) { + struct ggml_tensor * input = inputs[g][i]; + AT_PRINTF("input: %s\n", input->name); + allocate_node(alloc, input); + } + } + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + + // allocate parents (leafs) + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + allocate_node(alloc, parent); + } + + // allocate node + allocate_node(alloc, node); + + AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name); + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + AT_PRINTF("%s", parent->name); + if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { + AT_PRINTF(", "); + } + } + AT_PRINTF("\n"); + + // update parents + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + parent->n_children -= 1; + + //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); + + if (parent->n_children == 0 && parent->n_views == 0) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = get_view_source(parent); + view_src->n_views -= 1; + AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views); + if (view_src->n_views == 0 && view_src->n_children == 0 && view_src->data != node->data) { + ggml_allocator_free_tensor(alloc, view_src); + } + } + else 
{ + if (parent->data != node->data) { + ggml_allocator_free_tensor(alloc, parent); + } + } + } + } + AT_PRINTF("\n"); + } + // free graph outputs here that wouldn't be freed otherwise because they have no children + if (outputs != NULL && outputs[g] != NULL) { + for (int i = 0; outputs[g][i] != NULL; i++) { + struct ggml_tensor * output = outputs[g][i]; + AT_PRINTF("output: %s\n", output->name); + ggml_allocator_free_tensor(alloc, output); + } + } + } + + return alloc->max_size; +} + +size_t ggml_allocator_alloc_graph_tensors(struct ggml_allocator * alloc, struct ggml_cgraph * graph) { + return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); +} diff --git a/ggml-alloc.h b/ggml-alloc.h new file mode 100644 index 0000000000000..716d74642b156 --- /dev/null +++ b/ggml-alloc.h @@ -0,0 +1,21 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +GGML_API struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alignment); +GGML_API struct ggml_allocator * ggml_allocator_new_measure(size_t alignment); +GGML_API void ggml_allocator_free(struct ggml_allocator * alloc); +GGML_API bool ggml_allocator_is_measure(struct ggml_allocator * alloc); +GGML_API void ggml_allocator_reset(struct ggml_allocator * alloc); +GGML_API void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor); +GGML_API size_t ggml_allocator_alloc_graph_tensors(struct ggml_allocator * alloc, struct ggml_cgraph * graph); + + +#ifdef __cplusplus +} +#endif diff --git a/ggml.c b/ggml.c index b77f9926754ed..1999abe6fb755 100644 --- a/ggml.c +++ b/ggml.c @@ -4610,6 +4610,8 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.data =*/ (data == NULL && !ctx->no_alloc) ? 
(void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, + /*.n_children =*/ 0, + /*.n_views =*/ 0, /*.padding =*/ { 0 }, }; @@ -6741,6 +6743,18 @@ struct ggml_tensor * ggml_rope_inplace( return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true); } +struct ggml_tensor * ggml_rope_custom( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false); +} + struct ggml_tensor * ggml_rope_custom_inplace( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/ggml.h b/ggml.h index 9919cce7c263f..12422eda1c83c 100644 --- a/ggml.h +++ b/ggml.h @@ -451,7 +451,11 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - char padding[4]; + // temp - used by allocator + int n_children; + int n_views; + + char padding[16]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -1170,7 +1174,18 @@ extern "C" { int mode, int n_ctx); - // custom RoPE, in-place, returns view(a) + // custom RoPE + GGML_API struct ggml_tensor * ggml_rope_custom( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale); + + // in-place, returns view(a) GGML_API struct ggml_tensor * ggml_rope_custom_inplace( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/llama.cpp b/llama.cpp index 9a8ecdcf6f7f1..9f12cc0d956c0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -56,7 +56,13 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -#define LLAMA_USE_SCRATCH +#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_CLBLAST) && !defined(GGML_USE_METAL) +# include "ggml-alloc.h" +# define LLAMA_USE_ALLOCATOR +#else +# define LLAMA_USE_SCRATCH +#endif + #define LLAMA_MAX_SCRATCH_BUFFERS 16 // available llama models @@ -371,7 +377,17 @@ struct llama_context { 
// memory buffers used to evaluate the model // TODO: move in llama_state llama_ctx_buffer buf_compute; + +#ifdef LLAMA_USE_ALLOCATOR + llama_ctx_buffer buf_alloc; + ggml_allocator * alloc = NULL; +#endif + +#ifdef LLAMA_USE_SCRATCH llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + int buf_last = 0; + size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; +#endif #ifdef GGML_USE_METAL ggml_metal_context * ctx_metal = NULL; @@ -381,9 +397,6 @@ struct llama_context { ggml_mpi_context * ctx_mpi = NULL; #endif - int buf_last = 0; - size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; - void use_buf(struct ggml_context * ctx, int i) { #if defined(LLAMA_USE_SCRATCH) size_t last_size = 0; @@ -1360,32 +1373,15 @@ static bool llama_model_load( } } -// evaluate the transformer -// -// - lctx: llama context -// - tokens: new batch of tokens to process -// - embd embeddings input -// - n_tokens number of tokens -// - n_past: the context size so far -// - n_threads: number of threads to use -// -static bool llama_eval_internal( +static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_token * tokens, const float * embd, int n_tokens, - int n_past, - int n_threads, - const char * cgraph_fname) { + int n_past) { LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); -#ifdef GGML_USE_MPI - ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); -#endif - - const int64_t t_start_us = ggml_time_us(); - const int N = n_tokens; const auto & model = lctx.model; @@ -1401,10 +1397,9 @@ static bool llama_eval_internal( const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_vocab = hparams.n_vocab; + //const int64_t n_vocab = hparams.n_vocab; const int64_t n_embd_gqa = hparams.n_embd_gqa(); - LLAMA_ASSERT(n_embd_head == hparams.n_rot); const float freq_base = hparams.rope_freq_base; @@ -1413,29 +1408,40 @@ static bool llama_eval_internal( const int 
n_gpu_layers = model.n_gpu_layers; - auto & mem_per_token = lctx.mem_per_token; auto & buf_compute = lctx.buf_compute; + struct ggml_init_params params = { /*.mem_size =*/ buf_compute.size, /*.mem_buffer =*/ buf_compute.addr, /*.no_alloc =*/ false, }; +#ifdef LLAMA_USE_ALLOCATOR +# define ggml_rope_custom_inplace ggml_rope_custom +# define ggml_scale_inplace ggml_scale +# define ggml_diag_mask_inf_inplace ggml_diag_mask_inf +# define ggml_soft_max_inplace ggml_soft_max + params.no_alloc = true; +#endif + struct ggml_context * ctx0 = ggml_init(params); ggml_cgraph * gf = ggml_new_graph(ctx0); - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; - struct ggml_tensor * cur; struct ggml_tensor * inpL; if (tokens) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); +#ifdef LLAMA_USE_ALLOCATOR + ggml_allocator_alloc_tensor(lctx.alloc, inp_tokens); + if (!ggml_allocator_is_measure(lctx.alloc)) { + memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + } +#else memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); +#endif ggml_set_name(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); @@ -1472,6 +1478,17 @@ static bool llama_eval_internal( } #endif // GGML_USE_CUBLAS + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); +#ifdef LLAMA_USE_ALLOCATOR + ggml_allocator_alloc_tensor(lctx.alloc, KQ_scale); + if (!ggml_allocator_is_measure(lctx.alloc)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } +#else + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); +#endif + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + for (int il = 0; il < n_layer; ++il) { ggml_format_name(inpL, "layer_inp_%d", il); @@ -1567,9 +1584,6 @@ static 
bool llama_eval_internal( ggml_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) - struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - // KQ_scaled shape [n_past + N, N, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); offload_func_kq(KQ_scaled); @@ -1700,6 +1714,9 @@ static bool llama_eval_internal( ggml_set_name(cur, "result_norm"); embeddings = cur; +#ifdef LLAMA_USE_ALLOCATOR + // TODO: ensure that embeddings is not freed +#endif } // lm_head @@ -1711,11 +1728,84 @@ static bool llama_eval_internal( // logits -> probs //cur = ggml_soft_max_inplace(ctx0, cur); - // run the computation ggml_build_forward_expand(gf, cur); + // outputs: cur, embeddings + ggml_free(ctx0); + + return gf; + +#ifdef LLAMA_USE_ALLOCATOR +# undef ggml_rope_custom_inplace +# undef ggml_scale_inplace +# undef ggml_diag_mask_inf_inplace +# undef ggml_soft_max_inplace +#endif +} + +// evaluate the transformer +// +// - lctx: llama context +// - tokens: new batch of tokens to process +// - embd embeddings input +// - n_tokens number of tokens +// - n_past: the context size so far +// - n_threads: number of threads to use +// +static bool llama_eval_internal( + llama_context & lctx, + const llama_token * tokens, + const float * embd, + int n_tokens, + int n_past, + int n_threads, + const char * cgraph_fname) { + + LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); + + const int64_t t_start_us = ggml_time_us(); + +#ifdef GGML_USE_MPI + ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); +#endif + + const int N = n_tokens; + + const auto & model = lctx.model; + const auto & hparams = model.hparams; + + const auto & kv_self = lctx.kv_self; + + LLAMA_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + //const int64_t n_layer = hparams.n_layer; + //const int64_t n_ctx = hparams.n_ctx; + //const int64_t n_head = hparams.n_head; + //const int64_t n_head_kv = 
hparams.n_head_kv; + //const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_vocab = hparams.n_vocab; + //const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + //auto & mem_per_token = lctx.mem_per_token; + +#ifdef LLAMA_USE_ALLOCATOR + ggml_allocator_reset(lctx.alloc); +#endif + + ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past); + +#ifdef LLAMA_USE_ALLOCATOR + size_t sz = ggml_allocator_alloc_graph_tensors(lctx.alloc, gf); + //fprintf(stderr, "%s: compute buffer size: %.3f MB\n", __func__, sz / 1024.0 / 1024.0); +#endif + // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs); + // for big prompts, if BLAS is enabled, it is better to use only one thread + // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance + n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + #if GGML_USE_MPI ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); #endif @@ -1760,6 +1850,10 @@ static bool llama_eval_internal( lctx.kv_self.n = n_past + N; struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embeddings = NULL; + + LLAMA_ASSERT(strcmp(res->name, "result_output") == 0); + //LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0); if (cgraph_fname) { ggml_graph_export(gf, cgraph_fname); @@ -1798,9 +1892,9 @@ static bool llama_eval_internal( memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); } - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; - } + //if (mem_per_token == 0) { + // mem_per_token = ggml_used_mem(ctx0)/N; + //} #if 0 printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__, @@ -1811,8 +1905,6 @@ static bool llama_eval_internal( n_past, N); #endif - ggml_free(ctx0); - // measure the performance only for the 
single-token evals if (N == 1) { lctx.t_eval_us += ggml_time_us() - t_start_us; @@ -3178,10 +3270,38 @@ struct llama_context * llama_new_context_with_model( ctx->embedding.resize(hparams.n_embd); } +#ifdef LLAMA_USE_ALLOCATOR + ctx->buf_compute.resize(ggml_tensor_overhead() * 3072 + ggml_graph_overhead()); + + // measure memory requirements for worst-case graph + ctx->alloc = ggml_allocator_new_measure(32); + + // build worst-case graph + int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); + int n_past = hparams.n_ctx - n_tokens; + std::vector tokens(n_tokens, llama_token_bos()); + ggml_cgraph * gf = llama_build_graph(*ctx, tokens.data(), NULL, n_tokens, n_past); + + size_t size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf); + fprintf(stderr, "%s: worst-case graph size = %7.2f MB\n", __func__, size / 1024.0 / 1024.0); + fprintf(stderr, "%s: compute buffer total size: %7.2f MB\n", __func__, (ctx->buf_compute.size + size) / 1024.0 / 1024.0); + + size_t prev_req = MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) + MEM_REQ_SCRATCH1().at(ctx->model.type) + MEM_REQ_EVAL().at(ctx->model.type); + fprintf(stderr, "%s: equivalent with scratch buffer: %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0); + + + // recreate allocator with exact memory requirements + ggml_allocator_free(ctx->alloc); + ctx->buf_alloc.resize(size); + ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, 32); +#else ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead()); +#endif +#ifdef LLAMA_USE_SCRATCH ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); +#endif } #ifdef GGML_USE_METAL From 598a9ada8fccd6de5c15e6d692cefdc5109bd896 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 27 Jul 2023 12:14:51 +0200 Subject: [PATCH 02/13] adjust buffer size to account for alignment --- llama.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 
deletions(-) diff --git a/llama.cpp b/llama.cpp index 9f12cc0d956c0..f478aca22e95a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3271,18 +3271,19 @@ struct llama_context * llama_new_context_with_model( } #ifdef LLAMA_USE_ALLOCATOR + static const size_t tensor_alignment = 32; ctx->buf_compute.resize(ggml_tensor_overhead() * 3072 + ggml_graph_overhead()); // measure memory requirements for worst-case graph - ctx->alloc = ggml_allocator_new_measure(32); + ctx->alloc = ggml_allocator_new_measure(tensor_alignment); // build worst-case graph int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); int n_past = hparams.n_ctx - n_tokens; - std::vector tokens(n_tokens, llama_token_bos()); - ggml_cgraph * gf = llama_build_graph(*ctx, tokens.data(), NULL, n_tokens, n_past); + llama_token token = llama_token_bos(); + ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past); - size_t size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf); + size_t size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf) + tensor_alignment; fprintf(stderr, "%s: worst-case graph size = %7.2f MB\n", __func__, size / 1024.0 / 1024.0); fprintf(stderr, "%s: compute buffer total size: %7.2f MB\n", __func__, (ctx->buf_compute.size + size) / 1024.0 / 1024.0); @@ -3293,7 +3294,7 @@ struct llama_context * llama_new_context_with_model( // recreate allocator with exact memory requirements ggml_allocator_free(ctx->alloc); ctx->buf_alloc.resize(size); - ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, 32); + ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment); #else ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead()); #endif From 8afe3923984844bd994506ca48091c92e2e7b0fd Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 27 Jul 2023 12:15:49 +0200 Subject: [PATCH 03/13] fix mpi build --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 
f478aca22e95a..f34f6d0e921d4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1779,7 +1779,7 @@ static bool llama_eval_internal( LLAMA_ASSERT(!!kv_self.ctx); const int64_t n_embd = hparams.n_embd; - //const int64_t n_layer = hparams.n_layer; + const int64_t n_layer = hparams.n_layer; //const int64_t n_ctx = hparams.n_ctx; //const int64_t n_head = hparams.n_head; //const int64_t n_head_kv = hparams.n_head_kv; From 8fa548377a1ee5de19f47494447dd81724d6f9d3 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 27 Jul 2023 12:18:03 +0200 Subject: [PATCH 04/13] allow using the allocator with opencl --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index f34f6d0e921d4..6815c21965680 100644 --- a/llama.cpp +++ b/llama.cpp @@ -56,7 +56,7 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_CLBLAST) && !defined(GGML_USE_METAL) +#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL) # include "ggml-alloc.h" # define LLAMA_USE_ALLOCATOR #else From f67179aaf2eaffabd49c324aeb632257a3a5889b Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 27 Jul 2023 16:11:32 +0200 Subject: [PATCH 05/13] add list of ops that support in-place --- ggml-alloc.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 20ab23cf78e9d..fe24c05a15083 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -305,6 +305,33 @@ static struct ggml_tensor * get_view_source(struct ggml_tensor * t) { return parent; } +bool ggml_op_can_inplace(enum ggml_op op) { + switch (op) { + case GGML_OP_SCALE: + case GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_ADD: + case GGML_OP_ADD1: + case GGML_OP_ACC: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_UNARY: + case GGML_OP_ROPE: + case GGML_OP_RMS_NORM: + case GGML_OP_SET: + case 
GGML_OP_SOFT_MAX: + case GGML_OP_CONT: + return true; + + default: + return false; + } +} + static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * node) { if (node->data == NULL) { if (ggml_is_view(node)) { @@ -333,8 +360,7 @@ static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * no if (parent == NULL) { break; } - // TODO: make a list of operations that can be safely made inplace - if (parent->data != NULL && parent->n_children == 1 && parent->n_views == 0 && ggml_are_same_layout(node, parent) && node->op != GGML_OP_MUL_MAT) { + if (parent->data != NULL && parent->n_children == 1 && parent->n_views == 0 && ggml_are_same_layout(node, parent) && ggml_op_can_inplace(node->op)) { if (ggml_is_view(parent)) { struct ggml_tensor * view_src = get_view_source(parent); if (view_src->n_views == 1 && view_src->n_children == 0 && view_src->data == parent->data) { From 64584d56a78db9f0ded5f71570cf526cc4d6f693 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 27 Jul 2023 17:46:05 +0200 Subject: [PATCH 06/13] ggml : don't calculate data pointer of unallocated tensors when creating a view with an offset --- ggml.c | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/ggml.c b/ggml.c index 1999abe6fb755..3339cb2548aa1 100644 --- a/ggml.c +++ b/ggml.c @@ -4557,10 +4557,12 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml static struct ggml_tensor * ggml_new_tensor_impl( struct ggml_context * ctx, - enum ggml_type type, - int n_dims, - const int64_t* ne, - void* data) { + enum ggml_type type, + int n_dims, + const int64_t * ne, + void * data) { + + assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS); size_t data_size = 0; @@ -4650,22 +4652,22 @@ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int3 struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, - enum ggml_type type, - int n_dims, - const int64_t * ne) { + 
enum ggml_type type, + int n_dims, + const int64_t * ne) { return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL); } struct ggml_tensor * ggml_new_tensor_1d( struct ggml_context * ctx, - enum ggml_type type, + enum ggml_type type, int64_t ne0) { return ggml_new_tensor(ctx, type, 1, &ne0); } struct ggml_tensor * ggml_new_tensor_2d( struct ggml_context * ctx, - enum ggml_type type, + enum ggml_type type, int64_t ne0, int64_t ne1) { const int64_t ne[2] = { ne0, ne1 }; @@ -4674,7 +4676,7 @@ struct ggml_tensor * ggml_new_tensor_2d( struct ggml_tensor * ggml_new_tensor_3d( struct ggml_context * ctx, - enum ggml_type type, + enum ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) { @@ -6240,6 +6242,20 @@ struct ggml_tensor * ggml_reshape_4d( // ggml_view_1d +static struct ggml_tensor * ggml_view_tensor_offset( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_dims, + const int64_t * ne, + size_t offset) { + // don't calculate an offset from an unallocated tensor + void * data = NULL; + if (a->data != NULL) { + data = (char *) a->data + offset; + } + return ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data); +} + struct ggml_tensor * ggml_view_1d( struct ggml_context * ctx, struct ggml_tensor * a, @@ -6252,7 +6268,7 @@ struct ggml_tensor * ggml_view_1d( is_node = true; } - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); + struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset); ggml_format_name(result, "%s (view)", a->name); ggml_set_op_params(result, &offset, sizeof(offset)); @@ -6282,7 +6298,8 @@ struct ggml_tensor * ggml_view_2d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); + struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset); + ggml_format_name(result, "%s (view)", a->name); ggml_set_op_params(result, &offset, sizeof(offset)); @@ -6318,7 
+6335,8 @@ struct ggml_tensor * ggml_view_3d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); + struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset); + ggml_format_name(result, "%s (view)", a->name); ggml_set_op_params(result, &offset, sizeof(offset)); @@ -6356,7 +6374,7 @@ struct ggml_tensor * ggml_view_4d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); + struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset); ggml_format_name(result, "%s (view)", a->name); ggml_set_op_params(result, &offset, sizeof(offset)); From af7bd42b2a2c47d5259a4a3e878676fd60fb1319 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 27 Jul 2023 18:02:53 +0200 Subject: [PATCH 07/13] llama.cpp : free allocator when deleting context, cleanup --- llama.cpp | 68 +++++++++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/llama.cpp b/llama.cpp index 6815c21965680..a8c8014a0620e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -57,13 +57,13 @@ #endif #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL) -# include "ggml-alloc.h" -# define LLAMA_USE_ALLOCATOR +#include "ggml-alloc.h" +#define LLAMA_USE_ALLOCATOR #else -# define LLAMA_USE_SCRATCH +#define LLAMA_USE_SCRATCH +#define LLAMA_MAX_SCRATCH_BUFFERS 16 #endif -#define LLAMA_MAX_SCRATCH_BUFFERS 16 // available llama models enum e_model { @@ -333,13 +333,22 @@ struct llama_model { struct llama_context { llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} -#ifdef GGML_USE_METAL ~llama_context() { + if (model_owner) { + delete &model; + } +#ifdef GGML_USE_METAL if (ctx_metal) { ggml_metal_free(ctx_metal); } - } #endif +#ifdef LLAMA_USE_ALLOCATOR + if (alloc) { + 
ggml_allocator_free(alloc); + } +#endif + } + std::mt19937 rng; bool has_evaluated_once = false; @@ -1397,7 +1406,6 @@ static struct ggml_cgraph * llama_build_graph( const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); - //const int64_t n_vocab = hparams.n_vocab; const int64_t n_embd_gqa = hparams.n_embd_gqa(); LLAMA_ASSERT(n_embd_head == hparams.n_rot); @@ -1408,6 +1416,7 @@ static struct ggml_cgraph * llama_build_graph( const int n_gpu_layers = model.n_gpu_layers; + auto & mem_per_token = lctx.mem_per_token; auto & buf_compute = lctx.buf_compute; @@ -1730,9 +1739,22 @@ static struct ggml_cgraph * llama_build_graph( ggml_build_forward_expand(gf, cur); - // outputs: cur, embeddings + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + +#if 0 + printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__, + ggml_used_mem(ctx0)/1024.0/1024.0, + lctx.get_buf_max_mem(0)/1024.0/1024.0, + lctx.get_buf_max_mem(1)/1024.0/1024.0, + lctx.work_buffer.size()/1024.0/1024.0, + n_past, N); +#endif + ggml_free(ctx0); + // outputs: cur, embeddings return gf; #ifdef LLAMA_USE_ALLOCATOR @@ -1779,15 +1801,7 @@ static bool llama_eval_internal( LLAMA_ASSERT(!!kv_self.ctx); const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - //const int64_t n_ctx = hparams.n_ctx; - //const int64_t n_head = hparams.n_head; - //const int64_t n_head_kv = hparams.n_head_kv; - //const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_vocab = hparams.n_vocab; - //const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - //auto & mem_per_token = lctx.mem_per_token; #ifdef LLAMA_USE_ALLOCATOR ggml_allocator_reset(lctx.alloc); @@ -1796,8 +1810,7 @@ static bool llama_eval_internal( ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past); #ifdef LLAMA_USE_ALLOCATOR - size_t sz = 
ggml_allocator_alloc_graph_tensors(lctx.alloc, gf); - //fprintf(stderr, "%s: compute buffer size: %.3f MB\n", __func__, sz / 1024.0 / 1024.0); + ggml_allocator_alloc_graph_tensors(lctx.alloc, gf); #endif // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs); @@ -1807,6 +1820,7 @@ static bool llama_eval_internal( n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; #if GGML_USE_MPI + const int64_t n_layer = hparams.n_layer; ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); #endif @@ -1892,19 +1906,6 @@ static bool llama_eval_internal( memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); } - //if (mem_per_token == 0) { - // mem_per_token = ggml_used_mem(ctx0)/N; - //} - -#if 0 - printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__, - ggml_used_mem(ctx0)/1024.0/1024.0, - lctx.get_buf_max_mem(0)/1024.0/1024.0, - lctx.get_buf_max_mem(1)/1024.0/1024.0, - lctx.work_buffer.size()/1024.0/1024.0, - n_past, N); -#endif - // measure the performance only for the single-token evals if (N == 1) { lctx.t_eval_us += ggml_time_us() - t_start_us; @@ -3272,7 +3273,7 @@ struct llama_context * llama_new_context_with_model( #ifdef LLAMA_USE_ALLOCATOR static const size_t tensor_alignment = 32; - ctx->buf_compute.resize(ggml_tensor_overhead() * 3072 + ggml_graph_overhead()); + ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); // measure memory requirements for worst-case graph ctx->alloc = ggml_allocator_new_measure(tensor_alignment); @@ -3372,9 +3373,6 @@ struct llama_context * llama_init_from_file( } void llama_free(struct llama_context * ctx) { - if (ctx->model_owner) { - delete &ctx->model; - } delete ctx; } From e39e62ba4a594873ceb44c89709b5f9d1bdfea14 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 27 Jul 2023 
18:34:21 +0200 Subject: [PATCH 08/13] replace n_views and n_children in ggml_tensor with a hash table in the allocator --- ggml-alloc.c | 73 +++++++++++++++++++++++++++++++++++----------------- ggml.c | 2 -- ggml.h | 6 +---- llama.cpp | 2 +- 4 files changed, 51 insertions(+), 32 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index fe24c05a15083..d7c7978e48e74 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -14,6 +14,35 @@ //#define AT_PRINTF printf #define AT_PRINTF(...) ((void)0) +struct hash_node { + struct ggml_tensor * t; + int n_children; + int n_views; +}; + +static size_t hash(void * p) { + return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; +} + +static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) { + size_t h = hash(t); + + // linear probing + size_t i = h; + while (hash_table[i].t != NULL) { + if (hash_table[i].t == t) { + return &hash_table[i]; + } + i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // hash table is full + GGML_ASSERT(false); + } + } + + hash_table[i].t = t; + return &hash_table[i]; +} // TODO: GGML_PAD ? 
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { @@ -35,6 +64,7 @@ struct ggml_allocator { size_t alignment; int n_free_blocks; struct free_block free_blocks[MAX_FREE_BLOCKS]; + struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE]; size_t max_size; bool measure; @@ -215,6 +245,7 @@ struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alig /*.alignment = */ alignment, /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, + /*.hash_table = */ {{0}}, /*.max_size = */ 0, /*.measure = */ false, #ifdef GGML_ALLOCATOR_DEBUG @@ -241,6 +272,7 @@ struct ggml_allocator * ggml_allocator_new_measure(size_t alignment) { /*.alignment = */ alignment, /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, + /*.hash_table = */ {{0}}, /*.max_size = */ 0, /*.measure = */ true, #ifdef GGML_ALLOCATOR_DEBUG @@ -305,7 +337,7 @@ static struct ggml_tensor * get_view_source(struct ggml_tensor * t) { return parent; } -bool ggml_op_can_inplace(enum ggml_op op) { +static bool ggml_op_can_inplace(enum ggml_op op) { switch (op) { case GGML_OP_SCALE: case GGML_OP_DIAG_MASK_ZERO: @@ -333,6 +365,7 @@ bool ggml_op_can_inplace(enum ggml_op op) { } static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * node) { + struct hash_node * ht = alloc->hash_table; if (node->data == NULL) { if (ggml_is_view(node)) { size_t offset; @@ -360,10 +393,12 @@ static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * no if (parent == NULL) { break; } - if (parent->data != NULL && parent->n_children == 1 && parent->n_views == 0 && ggml_are_same_layout(node, parent) && ggml_op_can_inplace(node->op)) { + struct hash_node * p_hn = hash_get(ht, parent); + if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent) && ggml_op_can_inplace(node->op)) { if (ggml_is_view(parent)) { struct ggml_tensor * view_src = get_view_source(parent); - if (view_src->n_views == 1 && 
view_src->n_children == 0 && view_src->data == parent->data) { + struct hash_node * view_src_hn = hash_get(ht, view_src); + if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite // the parent's data that it will need later (same layout requirement). the problem is that then // we cannot free the tensor because the original address of the allocation is lost. @@ -391,21 +426,9 @@ static size_t ggml_allocator_alloc_graph_tensors_n( struct ggml_cgraph ** graphs, int n_graphs, struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) { - // reset counters - for (int g = 0; g < n_graphs; g++) { - struct ggml_cgraph * gf = graphs[g]; - for (int i = 0; i < gf->n_nodes; i++) { - struct ggml_tensor * node = gf->nodes[i]; - node->n_children = 0; - node->n_views = 0; - } - - for (int i = 0; i < gf->n_leafs; i++) { - struct ggml_tensor * leaf = gf->leafs[i]; - leaf->n_children = 0; - leaf->n_views = 0; - } - } + // reset hash table + struct hash_node * ht = alloc->hash_table; + memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE); // count number of children and views for (int g = 0; g < n_graphs; g++) { @@ -415,7 +438,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n( if (ggml_is_view(node)) { struct ggml_tensor * view_src = get_view_source(node); - view_src->n_views += 1; + hash_get(ht, view_src)->n_views += 1; } for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -423,7 +446,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n( if (parent == NULL) { break; } - parent->n_children += 1; + hash_get(ht, parent)->n_children += 1; } } } @@ -474,16 +497,18 @@ static size_t ggml_allocator_alloc_graph_tensors_n( if (parent == NULL) { break; } - parent->n_children -= 1; + struct hash_node * p_hn = hash_get(ht, parent); + p_hn->n_children -= 1; //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, 
parent->n_views); - if (parent->n_children == 0 && parent->n_views == 0) { + if (p_hn->n_children == 0 && p_hn->n_views == 0) { if (ggml_is_view(parent)) { struct ggml_tensor * view_src = get_view_source(parent); - view_src->n_views -= 1; + struct hash_node * view_src_hn = hash_get(ht, view_src); + view_src_hn->n_views -= 1; AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views); - if (view_src->n_views == 0 && view_src->n_children == 0 && view_src->data != node->data) { + if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { ggml_allocator_free_tensor(alloc, view_src); } } diff --git a/ggml.c b/ggml.c index 3339cb2548aa1..a0626264efee2 100644 --- a/ggml.c +++ b/ggml.c @@ -4612,8 +4612,6 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, - /*.n_children =*/ 0, - /*.n_views =*/ 0, /*.padding =*/ { 0 }, }; diff --git a/ggml.h b/ggml.h index 12422eda1c83c..aba92480c833c 100644 --- a/ggml.h +++ b/ggml.h @@ -451,11 +451,7 @@ extern "C" { void * extra; // extra things e.g. 
for ggml-cuda.cu - // temp - used by allocator - int n_children; - int n_views; - - char padding[16]; + char padding[4]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); diff --git a/llama.cpp b/llama.cpp index a8c8014a0620e..c376d6517ecc2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1813,7 +1813,7 @@ static bool llama_eval_internal( ggml_allocator_alloc_graph_tensors(lctx.alloc, gf); #endif - // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs); + // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance From e592a17a756095a453503385564bf4fa8e93ee97 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 27 Jul 2023 18:40:52 +0200 Subject: [PATCH 09/13] ggml : refactor ggml_view_Nd into ggml_view_tensor_offset --- ggml.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/ggml.c b/ggml.c index a0626264efee2..fa0f98aa09df2 100644 --- a/ggml.c +++ b/ggml.c @@ -6251,7 +6251,14 @@ static struct ggml_tensor * ggml_view_tensor_offset( if (a->data != NULL) { data = (char *) a->data + offset; } - return ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data); + + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data); + + ggml_format_name(result, "%s (view)", a->name); + + ggml_set_op_params(result, &offset, sizeof(offset)); + + return result; } struct ggml_tensor * ggml_view_1d( @@ -6267,9 +6274,6 @@ struct ggml_tensor * ggml_view_1d( } struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset); - ggml_format_name(result, "%s (view)", a->name); - - ggml_set_op_params(result, &offset, sizeof(offset)); result->op = GGML_OP_VIEW; result->grad = 
is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6298,10 +6302,6 @@ struct ggml_tensor * ggml_view_2d( struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset); - ggml_format_name(result, "%s (view)", a->name); - - ggml_set_op_params(result, &offset, sizeof(offset)); - result->nb[1] = nb1; result->nb[2] = result->nb[1]*ne1; result->nb[3] = result->nb[2]; @@ -6335,10 +6335,6 @@ struct ggml_tensor * ggml_view_3d( struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset); - ggml_format_name(result, "%s (view)", a->name); - - ggml_set_op_params(result, &offset, sizeof(offset)); - result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = result->nb[2]*ne2; @@ -6373,9 +6369,6 @@ struct ggml_tensor * ggml_view_4d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 }; struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset); - ggml_format_name(result, "%s (view)", a->name); - - ggml_set_op_params(result, &offset, sizeof(offset)); result->nb[1] = nb1; result->nb[2] = nb2; From ba0ab56b63077cb4d0231236cda6955016c498f2 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 27 Jul 2023 18:54:06 +0200 Subject: [PATCH 10/13] llama.cpp : fix embeddings output --- llama.cpp | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/llama.cpp b/llama.cpp index c376d6517ecc2..3ae2a895edf66 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1708,9 +1708,6 @@ static struct ggml_cgraph * llama_build_graph( lctx.use_buf(ctx0, 0); - // used at the end to optionally extract the embeddings - struct ggml_tensor * embeddings = NULL; - // norm { cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); @@ -1721,11 +1718,6 @@ static struct ggml_cgraph * llama_build_graph( cur = ggml_mul(ctx0, cur, model.norm); // offload_func_nr(cur); // TODO CPU + GPU mirrored backend ggml_set_name(cur, "result_norm"); - - embeddings = cur; -#ifdef LLAMA_USE_ALLOCATOR - // TODO: ensure that embeddings is not freed -#endif } // lm_head @@ -1754,7 
+1746,6 @@ static struct ggml_cgraph * llama_build_graph( ggml_free(ctx0); - // outputs: cur, embeddings return gf; #ifdef LLAMA_USE_ALLOCATOR @@ -1864,10 +1855,10 @@ static bool llama_eval_internal( lctx.kv_self.n = n_past + N; struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embeddings = NULL; + struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; LLAMA_ASSERT(strcmp(res->name, "result_output") == 0); - //LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0); if (cgraph_fname) { ggml_graph_export(gf, cgraph_fname); From 966c069b3fe49148b805d5ef4d2bf3fb6043c263 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 27 Jul 2023 19:03:31 +0200 Subject: [PATCH 11/13] llama.cpp : fix embeddings input --- llama.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/llama.cpp b/llama.cpp index 3ae2a895edf66..02582c4838a0b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1443,6 +1443,7 @@ static struct ggml_cgraph * llama_build_graph( if (tokens) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + #ifdef LLAMA_USE_ALLOCATOR ggml_allocator_alloc_tensor(lctx.alloc, inp_tokens); if (!ggml_allocator_is_measure(lctx.alloc)) { @@ -1460,7 +1461,15 @@ static struct ggml_cgraph * llama_build_graph( #endif inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + +#ifdef LLAMA_USE_ALLOCATOR + ggml_allocator_alloc_tensor(lctx.alloc, inpL); + if (!ggml_allocator_is_measure(lctx.alloc)) { + memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + } +#else memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); +#endif } const int i_gpu_start = n_layer - n_gpu_layers; From cd4a8cd28c05458a8c8ee37a7161a034a297d28d Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 28 Jul 2023 00:36:48 +0200 Subject: [PATCH 12/13] llama.cpp : better memory usage prints with allocator --- llama.cpp | 52 ++++++++++++++++++++++++++++++++-------------------- 1 
file changed, 32 insertions(+), 20 deletions(-) diff --git a/llama.cpp b/llama.cpp index 02582c4838a0b..6c57a2be9da38 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1252,12 +1252,16 @@ static void llama_model_load_internal( const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; // this is the total memory required to run the inference - const size_t mem_required = + size_t mem_required = ctx_size + - mmapped_size - vram_weights + // weights in VRAM not in memory + mmapped_size - vram_weights; // weights in VRAM not in memory + +#ifndef LLAMA_USE_ALLOCATOR + mem_required += MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) + MEM_REQ_SCRATCH1().at(model.type) + MEM_REQ_EVAL().at(model.type); +#endif // this is the memory required by one llama_state const size_t mem_required_state = @@ -3272,30 +3276,38 @@ struct llama_context * llama_new_context_with_model( } #ifdef LLAMA_USE_ALLOCATOR - static const size_t tensor_alignment = 32; - ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); + { + static const size_t tensor_alignment = 32; + // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data + ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); - // measure memory requirements for worst-case graph - ctx->alloc = ggml_allocator_new_measure(tensor_alignment); + // create measure allocator + ctx->alloc = ggml_allocator_new_measure(tensor_alignment); - // build worst-case graph - int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); - int n_past = hparams.n_ctx - n_tokens; - llama_token token = llama_token_bos(); - ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past); + // build worst-case graph + int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); + int n_past = hparams.n_ctx - n_tokens; + llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between 
token and embedding inputs graph + ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past); - size_t size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf) + tensor_alignment; - fprintf(stderr, "%s: worst-case graph size = %7.2f MB\n", __func__, size / 1024.0 / 1024.0); - fprintf(stderr, "%s: compute buffer total size: %7.2f MB\n", __func__, (ctx->buf_compute.size + size) / 1024.0 / 1024.0); + // measure memory requirements for the graph + size_t alloc_size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf) + tensor_alignment; - size_t prev_req = MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) + MEM_REQ_SCRATCH1().at(ctx->model.type) + MEM_REQ_EVAL().at(ctx->model.type); - fprintf(stderr, "%s: equivalent with scratch buffer: %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0); + fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); + // debug - for comparison with scratch buffer + //size_t prev_req = + // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) + + // MEM_REQ_SCRATCH1().at(ctx->model.type) + + // MEM_REQ_EVAL().at(ctx->model.type); + //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0); - // recreate allocator with exact memory requirements - ggml_allocator_free(ctx->alloc); - ctx->buf_alloc.resize(size); - ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment); + // recreate allocator with exact memory requirements + ggml_allocator_free(ctx->alloc); + + ctx->buf_alloc.resize(alloc_size); + ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment); + } #else ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead()); #endif From 570aa7ceeb780ee593da7d42a61d77b84b880e47 Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 29 Jul 2023 15:01:43 +0200 Subject: [PATCH 13/13] rename ggml_allocator to 
ggml_allocr cleanup ggml-ci --- ggml-alloc.c | 88 +++++++++++++++++++++++++++------------------------- ggml-alloc.h | 15 ++++----- llama.cpp | 39 +++++++++-------------- 3 files changed, 67 insertions(+), 75 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index d7c7978e48e74..5e1be61ff6cef 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -58,7 +58,7 @@ struct free_block { #define MAX_FREE_BLOCKS 128 -struct ggml_allocator { +struct ggml_allocr { void * data; size_t size; size_t alignment; @@ -97,13 +97,13 @@ static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_t #endif -static size_t ggml_allocator_get_alloc_size(struct ggml_allocator * alloc, struct ggml_tensor * tensor) { +static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { return ggml_nbytes(tensor); UNUSED(alloc); } -void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) { +void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { size_t size = ggml_allocator_get_alloc_size(alloc, tensor); size = aligned_offset(NULL, size, alloc->alignment); @@ -163,7 +163,7 @@ void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tens } // this is a very naive implementation, but for our case the number of free blocks should be very small -static void ggml_allocator_free_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) { +static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { void * ptr = tensor->data; if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) { @@ -229,17 +229,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocator * alloc, struct ggm alloc->n_free_blocks++; } -void ggml_allocator_reset(struct ggml_allocator * alloc) { +void ggml_allocr_reset(struct ggml_allocr * alloc) { alloc->n_free_blocks = 1; size_t align_offset = aligned_offset(alloc->data, 0, 
alloc->alignment); alloc->free_blocks[0].addr = (char *)alloc->data + align_offset; alloc->free_blocks[0].size = alloc->size - align_offset; } -struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alignment) { - struct ggml_allocator * alloc = (struct ggml_allocator *)malloc(sizeof(struct ggml_allocator) /* + n_free_blocks * sizeof(struct free_block) */); +struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) { + struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); - *alloc = (struct ggml_allocator){ + *alloc = (struct ggml_allocr){ /*.data = */ data, /*.size = */ size, /*.alignment = */ alignment, @@ -253,7 +253,7 @@ struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alig #endif }; - ggml_allocator_reset(alloc); + ggml_allocr_reset(alloc); return alloc; } @@ -263,10 +263,10 @@ struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alig static void * const MEASURE_BASE_ADDR = (void *) 0x1000; static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB -struct ggml_allocator * ggml_allocator_new_measure(size_t alignment) { - struct ggml_allocator * alloc = (struct ggml_allocator *)malloc(sizeof(struct ggml_allocator) /* + n_free_blocks * sizeof(struct free_block) */); +struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { + struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); - *alloc = (struct ggml_allocator){ + *alloc = (struct ggml_allocr){ /*.data = */ MEASURE_BASE_ADDR, /*.size = */ MEASURE_MAX_SIZE, /*.alignment = */ alignment, @@ -280,16 +280,16 @@ struct ggml_allocator * ggml_allocator_new_measure(size_t alignment) { #endif }; - ggml_allocator_reset(alloc); + ggml_allocr_reset(alloc); return alloc; } -void ggml_allocator_free(struct ggml_allocator * alloc) { +void ggml_allocr_free(struct 
ggml_allocr * alloc) { free(alloc); } -bool ggml_allocator_is_measure(struct ggml_allocator * alloc) { +bool ggml_allocr_is_measure(struct ggml_allocr * alloc) { return alloc->measure; } @@ -364,7 +364,7 @@ static bool ggml_op_can_inplace(enum ggml_op op) { } } -static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * node) { +static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) { struct hash_node * ht = alloc->hash_table; if (node->data == NULL) { if (ggml_is_view(node)) { @@ -388,41 +388,43 @@ static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * no } } else { // see if we can reuse a parent's buffer (inplace) - for (int i = 0; i < GGML_MAX_SRC; i++) { - struct ggml_tensor * parent = node->src[i]; - if (parent == NULL) { - break; - } - struct hash_node * p_hn = hash_get(ht, parent); - if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent) && ggml_op_can_inplace(node->op)) { - if (ggml_is_view(parent)) { - struct ggml_tensor * view_src = get_view_source(parent); - struct hash_node * view_src_hn = hash_get(ht, view_src); - if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { - // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite - // the parent's data that it will need later (same layout requirement). the problem is that then - // we cannot free the tensor because the original address of the allocation is lost. 
- // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views - // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) - AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); + if (ggml_op_can_inplace(node->op)) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + struct ggml_tensor * parent = node->src[i]; + if (parent == NULL) { + break; + } + struct hash_node * p_hn = hash_get(ht, parent); + if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = get_view_source(parent); + struct hash_node * view_src_hn = hash_get(ht, view_src); + if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { + // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite + // the parent's data that it will need later (same layout requirement). the problem is that then + // we cannot free the tensor because the original address of the allocation is lost. 
+ // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views + // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) + AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); + node->data = parent->data; + return; + } + } + else { + AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); node->data = parent->data; - return; } + return; } - else { - AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); - node->data = parent->data; - } - return; } } - ggml_allocator_alloc_tensor(alloc, node); + ggml_allocr_alloc(alloc, node); } } } static size_t ggml_allocator_alloc_graph_tensors_n( - struct ggml_allocator * alloc, + struct ggml_allocr * alloc, struct ggml_cgraph ** graphs, int n_graphs, struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) { @@ -455,7 +457,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n( for (int g = 0; g < n_graphs; g++) { struct ggml_cgraph * gf = graphs[g]; AT_PRINTF("####### graph %d/%d\n", g, n_graphs); - // graph inputs are allocated first to ensure that they are never overwritten + // graph inputs are allocated first to ensure that they are not overwritten by each other if (inputs != NULL && inputs[g] != NULL) { for (int i = 0; inputs[g][i] != NULL; i++) { struct ggml_tensor * input = inputs[g][i]; @@ -534,6 +536,6 @@ static size_t ggml_allocator_alloc_graph_tensors_n( return alloc->max_size; } -size_t ggml_allocator_alloc_graph_tensors(struct ggml_allocator * alloc, struct ggml_cgraph * graph) { +size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) { return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); } diff --git a/ggml-alloc.h b/ggml-alloc.h index 716d74642b156..a5ec8f87a9453 100644 --- a/ggml-alloc.h +++ b/ggml-alloc.h @@ -7,13 +7,14 @@ extern "C" { #endif -GGML_API struct ggml_allocator * ggml_allocator_new(void * data, 
size_t size, size_t alignment); -GGML_API struct ggml_allocator * ggml_allocator_new_measure(size_t alignment); -GGML_API void ggml_allocator_free(struct ggml_allocator * alloc); -GGML_API bool ggml_allocator_is_measure(struct ggml_allocator * alloc); -GGML_API void ggml_allocator_reset(struct ggml_allocator * alloc); -GGML_API void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor); -GGML_API size_t ggml_allocator_alloc_graph_tensors(struct ggml_allocator * alloc, struct ggml_cgraph * graph); +GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment); +GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment); + +GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); +GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); +GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); +GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor); +GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph); #ifdef __cplusplus diff --git a/llama.cpp b/llama.cpp index 6c57a2be9da38..e3e10fb7386d5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -344,7 +344,7 @@ struct llama_context { #endif #ifdef LLAMA_USE_ALLOCATOR if (alloc) { - ggml_allocator_free(alloc); + ggml_allocr_free(alloc); } #endif } @@ -389,7 +389,7 @@ struct llama_context { #ifdef LLAMA_USE_ALLOCATOR llama_ctx_buffer buf_alloc; - ggml_allocator * alloc = NULL; + ggml_allocr * alloc = NULL; #endif #ifdef LLAMA_USE_SCRATCH @@ -1431,10 +1431,6 @@ static struct ggml_cgraph * llama_build_graph( }; #ifdef LLAMA_USE_ALLOCATOR -# define ggml_rope_custom_inplace ggml_rope_custom -# define ggml_scale_inplace ggml_scale -# define ggml_diag_mask_inf_inplace ggml_diag_mask_inf -# define ggml_soft_max_inplace ggml_soft_max params.no_alloc = true; #endif @@ -1449,8 +1445,8 @@ static struct ggml_cgraph * llama_build_graph( struct ggml_tensor * inp_tokens = 
ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); #ifdef LLAMA_USE_ALLOCATOR - ggml_allocator_alloc_tensor(lctx.alloc, inp_tokens); - if (!ggml_allocator_is_measure(lctx.alloc)) { + ggml_allocr_alloc(lctx.alloc, inp_tokens); + if (!ggml_allocr_is_measure(lctx.alloc)) { memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); } #else @@ -1467,8 +1463,8 @@ static struct ggml_cgraph * llama_build_graph( inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); #ifdef LLAMA_USE_ALLOCATOR - ggml_allocator_alloc_tensor(lctx.alloc, inpL); - if (!ggml_allocator_is_measure(lctx.alloc)) { + ggml_allocr_alloc(lctx.alloc, inpL); + if (!ggml_allocr_is_measure(lctx.alloc)) { memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); } #else @@ -1502,8 +1498,8 @@ static struct ggml_cgraph * llama_build_graph( struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); #ifdef LLAMA_USE_ALLOCATOR - ggml_allocator_alloc_tensor(lctx.alloc, KQ_scale); - if (!ggml_allocator_is_measure(lctx.alloc)) { + ggml_allocr_alloc(lctx.alloc, KQ_scale); + if (!ggml_allocr_is_measure(lctx.alloc)) { ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); } #else @@ -1760,13 +1756,6 @@ static struct ggml_cgraph * llama_build_graph( ggml_free(ctx0); return gf; - -#ifdef LLAMA_USE_ALLOCATOR -# undef ggml_rope_custom -# undef ggml_scale -# undef ggml_diag_mask_inf -# undef ggml_soft_max -#endif } // evaluate the transformer @@ -1808,13 +1797,13 @@ static bool llama_eval_internal( const int64_t n_vocab = hparams.n_vocab; #ifdef LLAMA_USE_ALLOCATOR - ggml_allocator_reset(lctx.alloc); + ggml_allocr_reset(lctx.alloc); #endif ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past); #ifdef LLAMA_USE_ALLOCATOR - ggml_allocator_alloc_graph_tensors(lctx.alloc, gf); + ggml_allocr_alloc_graph(lctx.alloc, gf); #endif // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -3282,7 +3271,7 
@@ struct llama_context * llama_new_context_with_model( ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); // create measure allocator - ctx->alloc = ggml_allocator_new_measure(tensor_alignment); + ctx->alloc = ggml_allocr_new_measure(tensor_alignment); // build worst-case graph int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); @@ -3291,7 +3280,7 @@ struct llama_context * llama_new_context_with_model( ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past); // measure memory requirements for the graph - size_t alloc_size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf) + tensor_alignment; + size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); @@ -3303,10 +3292,10 @@ struct llama_context * llama_new_context_with_model( //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0); // recreate allocator with exact memory requirements - ggml_allocator_free(ctx->alloc); + ggml_allocr_free(ctx->alloc); ctx->buf_alloc.resize(alloc_size); - ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment); + ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment); } #else ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());