@@ -10243,9 +10243,6 @@ struct llm_build_context {
                         cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls, inp), model.cls_b);
                         cur = ggml_tanh(ctx0, cur);
                         cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls_out, cur), model.cls_out_b);
-
-                        // broadcast across the embedding size to make it compatible with the llama_get_embeddings API
-                        cur = ggml_repeat(ctx0, cur, inp);
                     } break;
                 default:
                     {
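
With the ggml_repeat broadcast removed, the graph for LLAMA_POOLING_TYPE_RANK now emits a single scalar per sequence instead of copying that scalar across all n_embd positions. A caller reads the score through the per-sequence embedding accessor; a minimal sketch, assuming a context created with pooling_type = LLAMA_POOLING_TYPE_RANK and a seq_id that was part of the decoded batch:

    #include "llama.h"
    #include <cstdio>

    // sketch: after llama_decode(), a rank-pooled sequence yields one float
    static void print_rank_score(struct llama_context * ctx, llama_seq_id seq_id) {
        const float * score = llama_get_embeddings_seq(ctx, seq_id);
        if (score == NULL) {
            fprintf(stderr, "no pooled output for seq %d\n", seq_id);
            return;
        }
        printf("rank score for seq %d: %f\n", seq_id, score[0]); // one value, not n_embd
    }
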
@@ -16997,7 +16994,6 @@ static int llama_decode_internal(
                 case LLAMA_POOLING_TYPE_MEAN:
                 case LLAMA_POOLING_TYPE_CLS:
                 case LLAMA_POOLING_TYPE_LAST:
-                case LLAMA_POOLING_TYPE_RANK:
                     {
                         // extract sequence embeddings (cleared before processing each batch)
                         auto & embd_seq_out = lctx.embd_seq;
@@ -17011,6 +17007,20 @@ static int llama_decode_internal(
                             ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
                         }
                     } break;
+                case LLAMA_POOLING_TYPE_RANK:
+                    {
+                        // extract the rank score - a single float per sequence
+                        auto & embd_seq_out = lctx.embd_seq;
+
+                        for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
+                            const llama_seq_id seq_id = ubatch.seq_id[s][0];
+                            if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                                continue;
+                            }
+                            embd_seq_out[seq_id].resize(1);
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
+                        }
+                    } break;
                 case LLAMA_POOLING_TYPE_UNSPECIFIED:
                     {
                         GGML_ABORT("unknown pooling type");