llama : sync gguf-llama with llama #2613

Merged: 6 commits, Aug 14, 2023

convert-llama-h5-to-gguf.py (20 changes: 6 additions & 14 deletions)
@@ -95,7 +95,7 @@ def count_model_parts(dir_model: str) -> int:

 gguf_writer.add_architecture(llm_arch)
 gguf_writer.add_name(last_dir)
-gguf_writer.add_file_type( "All tensors F32" if ftype == 0 else "Most tensors F16, some F32")
+gguf_writer.add_file_type("All tensors F32" if ftype == 0 else "Most tensors F16, some F32")
 gguf_writer.add_source_hf_repo(hf_repo)
 gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"])
 gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"])
@@ -122,19 +122,11 @@ def count_model_parts(dir_model: str) -> int:

     for i in range(tokenizer.vocab_size()):
         text: bytes
-        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode("utf-8")
-        elif tokenizer.is_control(i):
-            text = b""
-        if tokenizer.is_byte(i):
-            piece = tokenizer.id_to_piece(i)
-            if len(piece) != 6:
-                raise Exception(f"Invalid token: {piece}")
-            byte_value = int(piece[3:-1], 16)
-            text = struct.pack("B", byte_value)
-        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-        score: float = tokenizer.get_score(i)
+        score: float
+
+        piece = tokenizer.id_to_piece(i)
+        text = piece.encode("utf-8")
+        score = tokenizer.get_score(i)

         tokens.append(text)
         scores.append(score)
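
In effect, the converter now records every SentencePiece piece and its score verbatim and no longer special-cases unknown, control, or byte tokens. Below is a minimal, self-contained sketch of the simplified export loop; the tokenizer.model path and the tokens/scores lists are illustrative assumptions, while the tokenizer method names are the ones used in the diff above.

# Sketch of the simplified vocab export (assumes a local SentencePiece model file).
from sentencepiece import SentencePieceProcessor

tokenizer = SentencePieceProcessor()
tokenizer.Load("tokenizer.model")  # path is an assumption

tokens: list[bytes] = []
scores: list[float] = []

for i in range(tokenizer.vocab_size()):
    piece = tokenizer.id_to_piece(i)       # raw piece text, no special-casing
    tokens.append(piece.encode("utf-8"))   # stored as UTF-8 bytes
    scores.append(tokenizer.get_score(i))  # score as reported by SentencePiece

# tokens and scores are then handed to the gguf writer, as in the script above.
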
examples/gguf/gguf-llama-simple.cpp (252 changes: 126 additions & 126 deletions)
@@ -1,126 +1,126 @@
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-#include "common.h"
-#include "gguf-llama.h"
-#include "build-info.h"
-#include <cmath>
-#include <cstdio>
-#include <string>
-#include <vector>
-int main(int argc, char ** argv) {
-    gpt_params params;
-    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
-        return 1 ;
-    }
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
-    if (argc >= 3) {
-        params.prompt = argv[2];
-    }
-    if (params.prompt.empty()) {
-        params.prompt = "Hello my name is";
-    }
-    // init LLM
-    llama_backend_init(params.numa);
-    llama_context_params ctx_params = llama_context_default_params();
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
-    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
-        return 1;
-    }
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-    // tokenize the prompt
-    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
-    const int max_context_size = llama_n_ctx(ctx);
-    const int max_tokens_list_size = max_context_size - 4;
-    if ((int)tokens_list.size() > max_tokens_list_size) {
-        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
-        return 1;
-    }
-    fprintf(stderr, "\n\n");
-    for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
-    }
-    fflush(stderr);
-    // main loop
-    // The LLM keeps a contextual cache memory of previous token evaluation.
-    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
-    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
-    // example, we will just stop the loop once this cache is full or once an end of stream is detected.
-    while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
-        // evaluate the transformer
-        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return 1;
-        }
-        tokens_list.clear();
-        // sample the next token
-        llama_token new_token_id = 0;
-        auto logits = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(ctx);
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-        }
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
-        // is it an end of stream ?
-        if (new_token_id == llama_token_eos()) {
-            fprintf(stderr, " [end of text]\n");
-            break;
-        }
-        // print the new token :
-        printf("%s", llama_token_to_str(ctx, new_token_id));
-        fflush(stdout);
-        // push this new token for next evaluation
-        tokens_list.push_back(new_token_id);
-    }
-    llama_free(ctx);
-    llama_free_model(model);
-    llama_backend_free();
-    return 0;
-}
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include "common.h"
+#include "gguf-llama.h"
+#include "build-info.h"
+
+#include <cmath>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
+        return 1 ;
+    }
+
+    if (argc >= 2) {
+        params.model = argv[1];
+    }
+
+    if (argc >= 3) {
+        params.prompt = argv[2];
+    }
+
+    if (params.prompt.empty()) {
+        params.prompt = "Hello my name is";
+    }
+
+    // init LLM
+
+    llama_backend_init(params.numa);
+
+    llama_context_params ctx_params = llama_context_default_params();
+
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
+
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    // tokenize the prompt
+
+    std::vector<llama_token> tokens_list;
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+
+    const int max_context_size = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;
+
+    if ((int) tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
+        return 1;
+    }
+
+    fprintf(stderr, "\n\n");
+
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
+    }
+
+    fflush(stderr);
+
+    // main loop
+
+    // The LLM keeps a contextual cache memory of previous token evaluation.
+    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
+    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
+    // example, we will just stop the loop once this cache is full or once an end of stream is detected.
+
+    while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
+        // evaluate the transformer
+
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return 1;
+        }
+
+        tokens_list.clear();
+
+        // sample the next token
+
+        llama_token new_token_id = 0;
+
+        auto logits = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);
+
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+        }
+
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
+
+        // is it an end of stream ?
+        if (new_token_id == llama_token_eos()) {
+            fprintf(stderr, " [end of text]\n");
+            break;
+        }
+
+        // print the new token :
+        printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
+        fflush(stdout);
+
+        // push this new token for next evaluation
+        tokens_list.push_back(new_token_id);
+
+    }
+
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    return 0;
+}
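
For reference, the example is driven purely by positional arguments: a typical invocation after building would be ./gguf-llama-simple MODEL_PATH "PROMPT" (the binary name is an assumption based on the source file name). If the prompt is omitted, the program falls back to "Hello my name is", and generation stops once the KV cache is full or an end-of-stream token is sampled.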