diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 305f79492a055..09df5a96f4759 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1487,11 +1487,101 @@ struct server_context {
         }
     }
 
+    // for multiple images processing
+    bool ingest_images(server_slot &slot, int n_batch)
+    {
+        int image_idx = 0;
+        std::string prompt = "";
+
+        while (image_idx < (int) slot.images.size())
+        {
+            slot_image &img = slot.images[image_idx];
+
+            // process prefix prompt
+            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
+            {
+                const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+                llama_batch batch_view = {
+                    n_tokens,
+                    batch.token    + i,
+                    nullptr,
+                    batch.pos      + i,
+                    batch.n_seq_id + i,
+                    batch.seq_id   + i,
+                    batch.logits   + i,
+                    0, 0, 0, // unused
+                };
+                if (llama_decode(ctx, batch_view))
+                {
+                    LOG_TEE("%s : failed to eval\n", __func__);
+                    return false;
+                }
+            }
+
+            // process image with llm
+            for (int i = 0; i < img.image_tokens; i += n_batch)
+            {
+                int n_eval = img.image_tokens - i;
+                if (n_eval > n_batch)
+                {
+                    n_eval = n_batch;
+                }
+
+                const int n_embd = llama_n_embd(model);
+                llama_batch batch_img = {
+                    n_eval,
+                    nullptr,
+                    (img.image_embedding + i * n_embd),
+                    nullptr,
+                    nullptr,
+                    nullptr,
+                    nullptr,
+                    slot.n_past,
+                    1, 0
+                };
+                if (llama_decode(ctx, batch_img))
+                {
+                    LOG_TEE("%s : failed to eval image\n", __func__);
+                    return false;
+                }
+                slot.n_past += n_eval;
+            }
+            image_idx++;
+
+            llama_batch_clear(batch);
+
+            // append prefix of next image
+            const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
+                slot.params.input_suffix : // no more images, then process suffix prompt
+                (json)(slot.images[image_idx].prefix_prompt);
+
+            // rebuild the prompt since it was cleared earlier
+            prompt += img.prefix_prompt;
+            prompt += "[img-" + std::to_string(img.id) + "]";
+            if (image_idx >= (int) slot.images.size())
+            {
+                // only the suffix is appended here; the next image's prefix is added on its own iteration
+                prompt += json_prompt;
+            }
+
+            std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
+            for (int i = 0; i < (int) append_tokens.size(); ++i)
+            {
+                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
+                slot.n_past += 1;
+            }
+        }
+
+        // There is no prompt caching in multimodal currently
+        slot.n_prompt_tokens           = slot.n_past;
+        slot.n_prompt_tokens_processed = slot.n_past;
+
+        // store the rebuilt prompt for bookkeeping; its tokens were already evaluated above, so they are not processed again
+        slot.prompt = prompt;
+
+        return true;
+    }
+
     void request_cancel(int id_task) {
         server_task task;
         task.type = SERVER_TASK_TYPE_CANCEL;
         task.id_target = id_task;
-        queue_tasks.post(task);
     }
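
Both decode loops in `ingest_images()` split their work into chunks of at most `n_batch` tokens and advance `slot.n_past` by however many tokens were actually evaluated. The standalone sketch below reproduces only that chunking arithmetic; the token counts are made-up example values and the `llama_decode` call is replaced by a comment, so it does not depend on llama.cpp.

```cpp
// Minimal sketch of the chunking used by ingest_images(): a run of n_tokens
// (here: hypothetical image-embedding tokens) is processed n_batch tokens at
// a time, and the sequence position n_past advances by each chunk's size.
#include <algorithm>
#include <cstdio>

int main() {
    const int n_tokens = 576;  // example: one image's embedding tokens (assumed value)
    const int n_batch  = 512;  // example: max tokens per decode call (assumed value)
    int n_past = 0;            // running position in the sequence

    for (int i = 0; i < n_tokens; i += n_batch) {
        const int n_eval = std::min(n_batch, n_tokens - i);
        // the real code builds a llama_batch over embeddings [i, i + n_eval)
        // and returns false if llama_decode() fails; here we just log the chunk
        std::printf("decode chunk: offset=%d n_eval=%d -> n_past=%d\n", i, n_eval, n_past + n_eval);
        n_past += n_eval;
    }
    return 0;
}
```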
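
The `prompt` string stored back into `slot.prompt` interleaves each image's prefix prompt with an `[img-<id>]` placeholder and ends with the input suffix. A rough illustration of that ordering follows; the image struct is a simplified stand-in, not the server's `slot_image`, and the sample strings are invented.

```cpp
// Illustration only: rebuilds the bookkeeping prompt in the same order as
// ingest_images() -- prefix, [img-id] marker, repeated per image, then suffix.
#include <cstdio>
#include <string>
#include <vector>

struct image_stub {          // simplified stand-in for slot_image
    int         id;
    std::string prefix_prompt;
};

int main() {
    const std::vector<image_stub> images = {
        {10, "USER: describe "},
        {11, " and compare it with "},
    };
    const std::string input_suffix = "\nASSISTANT:";  // assumed suffix value

    std::string prompt;
    for (size_t i = 0; i < images.size(); ++i) {
        prompt += images[i].prefix_prompt;
        prompt += "[img-" + std::to_string(images[i].id) + "]";
        if (i + 1 == images.size()) {
            // after the last image the suffix is appended instead of another prefix
            prompt += input_suffix;
        }
    }
    std::printf("%s\n", prompt.c_str());
    return 0;
}
```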