WIP: Add support for CogAgent #12679

Draft · wants to merge 26 commits into master

4 changes: 2 additions & 2 deletions common/arg.cpp
@@ -1406,14 +1406,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.mmproj = value;
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
).set_examples({LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_COGAGENT}));
add_opt(common_arg(
{"--image"}, "FILE",
"path to an image file. use with multimodal models. Specify multiple times for batching",
[](common_params & params, const std::string & value) {
params.image.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
).set_examples({LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_VISION, LLAMA_EXAMPLE_COGAGENT}));
if (llama_supports_rpc()) {
add_opt(common_arg(
{"--rpc"}, "SERVERS",
2 changes: 2 additions & 0 deletions common/common.h
@@ -80,6 +80,8 @@ enum llama_example {
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_VISION,
LLAMA_EXAMPLE_COGAGENT,

LLAMA_EXAMPLE_COUNT,
};
383 changes: 371 additions & 12 deletions convert_hf_to_gguf.py

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions examples/CMakeLists.txt
@@ -18,6 +18,7 @@ if (EMSCRIPTEN)
else()
add_subdirectory(batched-bench)
add_subdirectory(batched)
add_subdirectory(cogagent)
add_subdirectory(embedding)
add_subdirectory(eval-callback)

@@ -53,6 +54,7 @@ else()
add_subdirectory(tokenize)
add_subdirectory(tts)
add_subdirectory(gen-docs)
add_subdirectory(vision)
if (NOT GGML_BACKEND_DL)
# these examples use the backends directly and cannot be built with dynamic loading
add_subdirectory(convert-llama2c-to-ggml)
18 changes: 18 additions & 0 deletions examples/cogagent/CMakeLists.txt
@@ -0,0 +1,18 @@
set(TARGET llama-cogagent-cli)
add_executable(${TARGET} cogagent-cli.cpp)
add_library(cogagent OBJECT
vision_encoder.cpp
vision_encoder.h
cross_vision.cpp
cross_vision.h
cogagent_util.cpp
cogagent_util.h
image_util.cpp
image_util.h)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-cogagent-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common cogagent ggml ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(cogagent PUBLIC ../../ggml/include)
target_include_directories(cogagent PUBLIC ../../include)
target_include_directories(cogagent PUBLIC ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
285 changes: 285 additions & 0 deletions examples/cogagent/cogagent-cli.cpp
@@ -0,0 +1,285 @@
#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "llama.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>

#include "cogagent.h"

cogagent_ctx cogagent_global;

// This function is mostly copied from cogagent cli
static bool eval_string_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();

// Process the input tokens in batches
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}

std::vector<int> pos;
pos.resize(n_eval);
for (int j=0; j<n_eval; j++) {
pos[j] = *n_past + j;
}

llama_batch batch = llama_batch_get_one(&tokens[i], n_eval);
batch.cross_embd = cogagent_global.cross_vision_image_tensor;
batch.pos = pos.data();
if (llama_decode(ctx_llama, batch)) {
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
}
return true;
}

static bool eval_image_tokens(llama_context * ctx_llama, std::vector<float> &img_data,
int n_batch, int * n_past) {
int n_embd = 4096;
int num_tokens = 258;
int positions[258];
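
// Compressed position layout for the 258 image-embedding tokens: the first
// and last tokens get their own positions, the 256 tokens in between all
// share a single position, so the whole image advances n_past by only 3
// (see the end of this function).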

positions[0] = *n_past;
for (int i=0; i<num_tokens-2; i++) {
positions[i + 1] = *n_past + 1;
}
positions[num_tokens - 1] = *n_past + 2;

float * data_ptr = img_data.data();

for (int i = 0; i < num_tokens; i += n_batch) {
int n_eval = num_tokens - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
llama_batch batch = {int32_t(n_eval), nullptr, data_ptr, positions + i, nullptr, nullptr, nullptr, nullptr, nullptr, };
batch.cross_embd = cogagent_global.cross_vision_image_tensor;
if (llama_decode(ctx_llama, batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
data_ptr += n_eval * n_embd; // advance past the embeddings consumed by this batch
}
*n_past += 3;
return true;
}

static void print_usage(int, char ** argv) {
LOG("\n example usage:\n");
LOG("\n %s -m <cogagent-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <cogagent-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}

static const char * sample(struct common_sampler * smpl,
struct llama_context * ctx_llama,
int * n_past) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true);

const llama_model * model = llama_get_model(ctx_llama);
const llama_vocab * vocab = llama_model_get_vocab(model);

static std::string ret;
if (llama_vocab_is_eog(vocab, id)) {
ret = "</s>";
} else {
ret = common_token_to_piece(ctx_llama, id);
}
// Feed the sampled token back through llama_decode so its key/value
// entries are stored in the KV cache for the next sampling step.
std::vector<llama_token> tokens;
tokens.push_back(id);
eval_string_tokens(ctx_llama, tokens, 1, n_past);

return ret.c_str();
}

static bool run_vision_encoders(const char* vision_encoder_path, const char* image_path) {
// Load image and resize for the encoders
std::vector<float> small_image_data; // For vision encoder
std::vector<float> large_image_data; // For cross vision encoder
if (!load_and_stretch_image(image_path, cogagent_global.vision_encoder_img_size,
small_image_data, cogagent_global.norm_mean, cogagent_global.norm_deviation)) {
printf("Failed to load the specified image file.\n");
return false;
}
if (!load_and_stretch_image(image_path, cogagent_global.cross_vision_img_size,
large_image_data, cogagent_global.norm_mean, cogagent_global.norm_deviation)) {
printf("Failed to load the specified image file.\n");
return false;
}

// For debugging purposes
const char * vision_encoder_resized_image = "cogagent_encoders/llama_vision_encoder_input.gguf";
int dims[3] = {cogagent_global.vision_encoder_img_size,
cogagent_global.vision_encoder_img_size, 3};
save_tensor_from_data(small_image_data, dims, vision_encoder_resized_image);
const char * cross_vision_resized_image = "cogagent_encoders/llama_cross_vision_input.gguf";
dims[0] = cogagent_global.cross_vision_img_size;
dims[1] = cogagent_global.cross_vision_img_size;
save_tensor_from_data(large_image_data, dims, cross_vision_resized_image);

// const char * reference_vision_encoder_input = "/home/tianyue/myworkspace"
// "/vlm_intermediate/vision_encoder_input.gguf";
// const char * reference_cross_vision_input = "/home/tianyue/myworkspace"
// "/vlm_intermediate/cross_vision_input.gguf";
// // Load the reference input
// if (get_input(small_image_data, reference_vision_encoder_input) < 0) {
// printf("Failed to load small image input\n");
// return false;
// }
// if (get_input(large_image_data, reference_cross_vision_input) < 0) {
// printf("Failed to load big image input\n");
// return false;
// }
printf("Loaded and resized the specified image.\n");

// Load the vision encoder weights
if (!vision_encoder_init_load(vision_encoder_path)) {
printf("Failed to load vision encoder model file.\n");
return false;
}
printf("Vision encoder weights loaded.\n");

// Run the vision encoder
run_vision_encoder(small_image_data);
printf("Completed vision encoder run on image file.\n");

free_vision_encoder_ctx();

// Load and run the cross vision encoder
if (!cross_vision_init_load(vision_encoder_path)) {
printf("Failed to load cross vision encoder model file.\n");
return false;
}
printf("Cross vision encoder weights loaded.\n");

run_cross_vision(large_image_data);
printf("Completed cross vision encoder run on image file.\n");

free_cross_vision_ctx();
return true;
}

int main(int argc, char ** argv) {
ggml_time_init();
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COGAGENT, print_usage)) {
return 1;
}
common_init();

llama_backend_init();
llama_numa_init(params.numa);

// Initialize a GGML context to store the encoded image tensors
struct ggml_init_params token_ctx_params = {
size_t(40000000),
NULL,
false,
};
cogagent_global.token_ctx = ggml_init(token_ctx_params);
if (!cogagent_global.token_ctx) {
printf("Failed to initialize token storage context.\n");
return 1;
}
// Allocate the tensor for cross vision encoded image
cogagent_global.cross_vision_image_tensor = ggml_new_tensor_2d(
cogagent_global.token_ctx, GGML_TYPE_F32, 1024, 6400
);

// Load the images and the encoder models
// Then run the encoder models
if (!run_vision_encoders(params.mmproj.c_str(), params.image[0].c_str())) {
return 1;
}

llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
if (model == nullptr) {
printf("Failed to load decoder model\n");
return 1;
}

llama_context_params ctx_params = common_context_params_to_llama(params);
printf("Context size is %d tokens\n", ctx_params.n_ctx);
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);

if (ctx_llama == nullptr) {
printf("Failed to create the llama context\n");
return 1;
}

cogagent_global.ctx_llama = ctx_llama;
cogagent_global.cogvlm_model = model;

// NOTE: it is not yet clear how the llama.cpp KV cache keeps its contents
// across llama_decode calls. The compute graph appears to be allocated for
// each batch, which would invalidate any tensors stored in it, and there is
// no obvious logic for allocating the KV cache tensors separately to avoid
// that. Perhaps the graph is not actually re-allocated for every batch,
// which would also explain why a worst-case graph is reserved up front.

// TODO: Check if system prompt is compatible
std::vector<llama_token> begin_token;
const llama_vocab * vocab = llama_model_get_vocab(cogagent_global.cogvlm_model);
begin_token.push_back(llama_vocab_bos(vocab));

int n_past = 0;
printf("Run model with bos token.\n");
eval_string_tokens(cogagent_global.ctx_llama,
begin_token, params.n_batch, &n_past);
printf("Run model with image tokens.\n");
eval_image_tokens(cogagent_global.ctx_llama, cogagent_global.vision_encoder_image,
params.n_batch, &n_past);
// Tokenize the user prompt.
// The third argument is set to false so that the tokenizer doesn't add
// beginning-of-sentence or end-of-sentence tokens.
std::vector<llama_token> user_prompt_tokens = common_tokenize(
cogagent_global.ctx_llama, params.prompt, false, true
);
printf("Run model with user entered text tokens.\n");
eval_string_tokens(cogagent_global.ctx_llama, user_prompt_tokens,
params.n_batch, &n_past);

printf("Parsed maximum sampling length %d.\n", params.n_predict);
int max_len = params.n_predict < 0 ? 256 : params.n_predict;

struct common_sampler * smpl = common_sampler_init(cogagent_global.cogvlm_model, params.sampling);
if (!smpl) {
printf("Failed to initialize sampler.\n");
return 1;
}
printf("\nReprinting entered prompt.\n %s \n", params.prompt.c_str());
printf("\n\n Beginning of response.\n");
std::string response = "";
for (int i=0; i<max_len; ++i) {
const char * tmp = sample(smpl, cogagent_global.ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) {
if (i < 10) {
continue;
}
break;
}
printf("%s", tmp);
fflush(stdout);
}
common_sampler_free(smpl);

llama_model_free(model);
ggml_free(cogagent_global.token_ctx);
return 0;
}
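
As a cross-check of the position bookkeeping in main() above, this small standalone snippet (plain C++, no llama.cpp dependency; the prompt length is a made-up placeholder) tallies how far n_past advances after the BOS token, the 258 image-embedding tokens (which only consume 3 positions, see eval_image_tokens), and the user prompt:

#include <cstdio>

int main() {
    int n_past = 0;

    n_past += 1;                     // eval_string_tokens() with the single BOS token
    n_past += 3;                     // eval_image_tokens(): 258 embeddings, only 3 positions
    const int n_prompt_tokens = 12;  // placeholder for the tokenized user prompt length
    n_past += n_prompt_tokens;       // eval_string_tokens() with the prompt tokens

    printf("first sampled token is decoded at position %d\n", n_past);
    return 0;
}
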
36 changes: 36 additions & 0 deletions examples/cogagent/cogagent.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#ifndef COGAGENT_H
#define COGAGENT_H

#include "vision_encoder.h"
#include "cross_vision.h"
#include "cogagent_util.h"
#include "image_util.h"
#include "ggml.h"
#include "gguf.h"

struct cogagent_ctx {
// Vision encoder and cross vision encoder models
vision_encoder_ctx vision_encoder;
cross_vision_ctx cross_vision;

struct llama_context * ctx_llama;
struct llama_model * cogvlm_model;

// Context for storing vision tokens and cross vision
// embedded picture tensor
ggml_context * token_ctx;

std::string user_prompt;
std::vector<float> vision_encoder_image; // Image encoded by the vision encoder
struct ggml_tensor * cross_vision_image_tensor; // Image encoded by the cross vision encoder

int vision_encoder_img_size = 224;
int cross_vision_img_size = 1120;

float norm_mean[3] = {0.48145466, 0.4578275, 0.40821073};
float norm_deviation[3] = {0.26862954, 0.26130258, 0.27577711};
};

extern struct cogagent_ctx cogagent_global;

#endif
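
The norm_mean / norm_deviation values above are the standard CLIP preprocessing constants. A minimal standalone sketch of how load_and_stretch_image presumably applies them per channel (the actual implementation lives in image_util.cpp, which is not shown here; the (pixel / 255 - mean) / deviation convention is an assumption):

#include <cstdio>

int main() {
    // Per-channel normalization constants copied from cogagent.h
    const float norm_mean[3]      = {0.48145466f, 0.4578275f, 0.40821073f};
    const float norm_deviation[3] = {0.26862954f, 0.26130258f, 0.27577711f};

    // Hypothetical RGB pixel as read from an 8-bit image
    const unsigned char pixel[3] = {200, 128, 64};

    // Assumed convention: scale to [0,1], subtract the mean, divide by the deviation
    for (int c = 0; c < 3; ++c) {
        float v = (pixel[c] / 255.0f - norm_mean[c]) / norm_deviation[c];
        printf("channel %d -> %f\n", c, v);
    }
    return 0;
}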