Skip to content

Commit 044ec4b

Browse files
committed
embedding : add EOS token if not present (#899)
1 parent 77178ee commit 044ec4b

File tree

1 file changed

+9
-2
lines changed

1 file changed

+9
-2
lines changed

examples/embedding/embedding.cpp

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -112,13 +112,20 @@ int main(int argc, char ** argv) {
112112
// tokenize the prompts and trim
113113
std::vector<std::vector<int32_t>> inputs;
114114
for (const auto & prompt : prompts) {
115-
auto inp = ::llama_tokenize(ctx, prompt, true);
115+
auto inp = ::llama_tokenize(ctx, prompt, true, false);
116116
if (inp.size() > n_batch) {
117117
inp.resize(n_batch);
118118
}
119119
inputs.push_back(inp);
120120
}
121121

122+
// add eos if not present
123+
for (auto & inp : inputs) {
124+
if (inp.empty() || inp.back() != llama_token_eos(model)) {
125+
inp.push_back(llama_token_eos(model));
126+
}
127+
}
128+
122129
// tokenization stats
123130
if (params.verbose_prompt) {
124131
for (int i = 0; i < (int) inputs.size(); i++) {
@@ -172,7 +179,7 @@ int main(int argc, char ** argv) {
172179
for (int j = 0; j < n_prompts; j++) {
173180
fprintf(stdout, "embedding %d: ", j);
174181
for (int i = 0; i < std::min(16, n_embd); i++) {
175-
fprintf(stdout, "%f ", emb[j * n_embd + i]);
182+
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
176183
}
177184
fprintf(stdout, "\n");
178185
}

0 commit comments

Comments
 (0)