persimmon : simplify rope logic #4302

Closed · wants to merge 3 commits
llama.cpp: 101 changes (18 additions, 83 deletions)
@@ -3029,13 +3029,6 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;
 
                 if (n_gpu_layers > int(n_layer)) {
-#ifdef GGML_USE_CUBLAS
-                    if (n_gpu_layers > int(n_layer + 1)) {
-                        LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
-                            __func__, n_layer + 1);
-                        throw std::runtime_error("Persimmon CUDA offload failed");
-                    }
-#endif
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                     // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
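Note: the removed guard existed only because the original Persimmon graph used custom ops that had no CUDA kernels (issue #4038, linked in the error message). With the rope logic below rewritten in terms of ggml's standard ops, the cap on offloadable layers is presumably obsolete.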
@@ -4348,105 +4341,47 @@ struct llm_build_context {
         struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
         cb(tmpqkv_perm, "tmpqkv", il);
 
-        struct ggml_tensor * tmpq = ggml_view_3d(
+        struct ggml_tensor * Qcur = ggml_view_3d(
                 ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
                 ggml_element_size(tmpqkv_perm) * n_embd_head,
                 ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
                 0
                 );
-        cb(tmpq, "tmpq", il);
+        cb(Qcur, "Qcur", il);
 
-        struct ggml_tensor * tmpk = ggml_view_3d(
+        struct ggml_tensor * Kcur = ggml_view_3d(
                 ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
                 ggml_element_size(tmpqkv_perm) * n_embd_head,
                 ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
                 ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
                 );
-        cb(tmpk, "tmpk", il);
+        cb(Kcur, "Kcur", il);
 
         // Q/K Layernorm
-        tmpq = llm_build_norm(ctx0, tmpq, hparams,
+        Qcur = llm_build_norm(ctx0, Qcur, hparams,
                 model.layers[il].attn_q_norm,
                 model.layers[il].attn_q_norm_b,
                 LLM_NORM, cb, il);
-        cb(tmpq, "tmpq", il);
+        cb(Qcur, "Qcur", il);
 
-        tmpk = llm_build_norm(ctx0, tmpk, hparams,
+        Kcur = llm_build_norm(ctx0, Kcur, hparams,
                 model.layers[il].attn_k_norm,
                 model.layers[il].attn_k_norm_b,
                 LLM_NORM, cb, il);
-        cb(tmpk, "tmpk", il);
+        cb(Kcur, "Kcur", il);
 
-        // RoPE the first n_rot of q/k, pass the other half, and concat.
-        struct ggml_tensor * qrot = ggml_view_3d(
-                ctx0, tmpq, n_rot, n_head, n_tokens,
-                ggml_element_size(tmpq) * n_embd_head,
-                ggml_element_size(tmpq) * n_embd_head * n_head,
-                0
-                );
-        cb(qrot, "qrot", il);
-
-        struct ggml_tensor * krot = ggml_view_3d(
-                ctx0, tmpk, n_rot, n_head, n_tokens,
-                ggml_element_size(tmpk) * n_embd_head,
-                ggml_element_size(tmpk) * n_embd_head * n_head,
-                0
-                );
-        cb(krot, "krot", il);
-
-        // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
-        struct ggml_tensor * qpass = ggml_view_3d(
-                ctx0, tmpq, n_rot, n_head, n_tokens,
-                ggml_element_size(tmpq) * n_embd_head,
-                ggml_element_size(tmpq) * n_embd_head * n_head,
-                ggml_element_size(tmpq) * n_rot
-                );
-        cb(qpass, "qpass", il);
-
-        struct ggml_tensor * kpass = ggml_view_3d(
-                ctx0, tmpk, n_rot, n_head, n_tokens,
-                ggml_element_size(tmpk) * n_embd_head,
-                ggml_element_size(tmpk) * n_embd_head * n_head,
-                ggml_element_size(tmpk) * n_rot
-                );
-        cb(kpass, "kpass", il);
-
-        struct ggml_tensor * qrotated = ggml_rope_custom(
-                ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx,
-                freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-        cb(qrotated, "qrotated", il);
-
-        struct ggml_tensor * krotated = ggml_rope_custom(
-                ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx,
-                freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+        Qcur = ggml_rope_custom(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
                 );
-        cb(krotated, "krotated", il);
 
-        // ggml currently only supports concatenation on dim=2
-        // so we need to permute qrot, qpass, concat, then permute back.
-        qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
-        cb(qrotated, "qrotated", il);
-
-        krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
-        cb(krotated, "krotated", il);
-
-        qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
-        cb(qpass, "qpass", il);
-
-        kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
-        cb(kpass, "kpass", il);
-
-        struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
-        cb(Qcur, "Qcur", il);
-
-        struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
-        cb(Kcur, "Kcur", il);
-
-        struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
-        cb(Q, "Q", il);
-
-        Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+        Kcur = ggml_rope_custom(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head, n_tokens), inp_pos,
+                n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+                );
         cb(Kcur, "Kcur", il);
 
         struct ggml_tensor * Vcur = ggml_view_3d(
@@ -4462,7 +4397,7 @@ struct llm_build_context {
 
         // TODO: not tested, could be broken
         cur = llm_build_kqv(ctx0, hparams, kv_self,
                 model.layers[il].wo, model.layers[il].bo,
-                Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
         cb(cur, "kqv_out", il);
     }

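What the change amounts to: the old graph implemented Persimmon's partial RoPE by hand. It viewed the first n_rot dims of each head (qrot/krot), roped only those, and then permuted, concatenated, and permuted back the untouched halves (qpass/kpass), because ggml_concat only supports concatenation on dim 2. The new graph instead passes n_dims = n_rot to a single ggml_rope_custom call per tensor, relying on ggml's NEOX-style rope to rotate only the first n_rot components of each head and pass the rest through unchanged. A minimal sketch of the new pattern, assuming the usual llm_build_context variables (ctx0, inp_pos, n_rot, n_embd_head, n_head, n_tokens and the rope hyperparameters) are in scope; `q` here is a stand-in name for the post-layernorm Q view from the diff above:

    // Partial RoPE in a single call: with n_dims = n_rot < n_embd_head,
    // NEOX-style rope (mode 2) rotates only the first n_rot components of
    // each head and leaves the remaining components untouched.
    struct ggml_tensor * Qcur = ggml_rope_custom(
            ctx0,
            ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens),
            inp_pos,     // one position per token
            n_rot,       // rotate only this many dims per head
            2,           // rope mode (NEOX-style)
            0,           // n_ctx, passed as 0 as in the diff
            n_orig_ctx,  // original training context, for rope scaling
            freq_base, freq_scale,
            ext_factor, attn_factor, beta_fast, beta_slow);

The same call is issued for Kcur, which is why the qrot/qpass and krot/kpass views, the permute/concat round-trips, and the final un-permute all drop out of the graph, and why llm_build_kqv receives Qcur directly in the last hunk.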