Commit 7820364
server : Add option to return token pieces in /tokenize endpoint (#9108)

* server : added with_pieces functionality to /tokenize endpoint
* server : Add tokenize with pieces tests to server.feature
* Handle case if tokenizer splits along utf8 continuation bytes
* Add example of token splitting
* Remove trailing ws
* Fix trailing ws
* Maybe fix ci
* maybe this fix windows ci?

Co-authored-by: Xuan Son Nguyen <[email protected]>
1 parent e6b7801 commit 7820364

File tree: 6 files changed (+139, -6 lines)


.github/workflows/server.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -173,6 +173,7 @@ jobs:
         if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
+          $env:PYTHONIOENCODING = ":replace"
           behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 
       - name: Slow tests
```
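The added line matters because a `PYTHONIOENCODING` value of the form `:errorhandler` keeps the stream encoding unchanged and only switches the error handler, so `behave` output containing multi-byte pieces no longer crashes a Windows console stuck on a narrow codepage. A minimal sketch of the failure mode this guards against, assuming cp1252 as the console encoding (an illustration, not part of the commit):

```python
# A character outside cp1252, like the multi-byte input in the new feature
# test, cannot be encoded with the default "strict" error handler.
text = "tokenize this: \u5abd"

try:
    text.encode("cp1252")  # strict: raises UnicodeEncodeError
except UnicodeEncodeError as exc:
    print("strict encoding failed:", exc.reason)

# PYTHONIOENCODING=":replace" has the same effect as this error handler:
# unencodable characters degrade to "?" instead of killing the test run.
print(text.encode("cp1252", errors="replace"))
```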

examples/server/README.md

Lines changed: 37 additions & 2 deletions
````diff
@@ -407,9 +407,44 @@ Notice that each `probs` is an array of length `n_probs`.
 
 *Options:*
 
-`content`: Set the text to tokenize.
+`content`: (Required) The text to tokenize.
 
-`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
+`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
+
+`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`
+
+**Response:**
+
+Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The `piece` field is a string if the piece is valid UTF-8, or a list of byte values otherwise.
+
+If `with_pieces` is `false`:
+```json
+{
+  "tokens": [123, 456, 789]
+}
+```
+
+If `with_pieces` is `true`:
+```json
+{
+  "tokens": [
+    {"id": 123, "piece": "Hello"},
+    {"id": 456, "piece": " world"},
+    {"id": 789, "piece": "!"}
+  ]
+}
+```
+
+With input `á` (UTF-8 hex: C3 A1) on tinyllama/stories260k, where the tokenizer splits along continuation bytes:
+```json
+{
+  "tokens": [
+    {"id": 198, "piece": [195]}, // hex C3
+    {"id": 164, "piece": [161]}  // hex A1
+  ]
+}
+```
 
 ### POST `/detokenize`: Convert tokens to text
````
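For completeness, the request side mirrors the documented responses. A minimal client sketch, assuming a server listening on `http://localhost:8080` and the third-party `requests` package (neither is part of the diff):

```python
import requests

# Request token IDs together with the piece of text each ID maps to.
resp = requests.post(
    "http://localhost:8080/tokenize",
    json={"content": "Hello world!", "with_pieces": True},
)
resp.raise_for_status()

for token in resp.json()["tokens"]:
    # "piece" is a string for valid UTF-8, or a list of byte values otherwise
    print(token["id"], repr(token["piece"]))
```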

examples/server/server.cpp

Lines changed: 30 additions & 3 deletions
```diff
@@ -3013,12 +3013,39 @@ int main(int argc, char ** argv) {
     const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
         const json body = json::parse(req.body);
 
-        std::vector<llama_token> tokens;
+        json tokens_response = json::array();
         if (body.count("content") != 0) {
             const bool add_special = json_value(body, "add_special", false);
-            tokens = ctx_server.tokenize(body.at("content"), add_special);
+            const bool with_pieces = json_value(body, "with_pieces", false);
+            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
+
+            if (with_pieces) {
+                for (const auto& token : tokens) {
+                    std::string piece = llama_token_to_piece(ctx_server.ctx, token);
+                    json piece_json;
+
+                    // Check if the piece is valid UTF-8
+                    if (is_valid_utf8(piece)) {
+                        piece_json = piece;
+                    } else {
+                        // If not valid UTF-8, store as array of byte values
+                        piece_json = json::array();
+                        for (unsigned char c : piece) {
+                            piece_json.push_back(static_cast<int>(c));
+                        }
+                    }
+
+                    tokens_response.push_back({
+                        {"id", token},
+                        {"piece", piece_json}
+                    });
+                }
+            } else {
+                tokens_response = tokens;
+            }
         }
-        const json data = format_tokenizer_response(tokens);
+
+        const json data = format_tokenizer_response(tokens_response);
         res_ok(res, data);
     };
```
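Because the handler can emit a piece either as a UTF-8 string or as a raw byte list, a client that wants to reconstruct the original text has to concatenate at the byte level. A hedged sketch of that reassembly (the endpoint and field names come from the diff; the helper itself is illustrative):

```python
def join_pieces(tokens: list) -> str:
    """Concatenate /tokenize pieces, re-joining byte-level splits."""
    buf = bytearray()
    for token in tokens:
        piece = token["piece"]
        if isinstance(piece, str):
            buf.extend(piece.encode("utf-8"))
        else:
            # list of ints: the tokenizer split inside a UTF-8 sequence
            buf.extend(piece)
    return buf.decode("utf-8")

# The README's 'á' example: two tokens carrying one byte each (C3, A1)
assert join_pieces([
    {"id": 198, "piece": [195]},
    {"id": 164, "piece": [161]},
]) == "á"
```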

examples/server/tests/features/server.feature

Lines changed: 8 additions & 0 deletions
```diff
@@ -105,6 +105,14 @@ Feature: llama.cpp server
     Given first token is removed
     Then tokens can be detokenized
 
+  Scenario: Tokenize with pieces
+    When tokenizing with pieces:
+      """
+      What is the capital of Germany?
+      媽
+      """
+    Then tokens are given with pieces
+
   Scenario: Models available
     Given available models
     Then 1 models are supported
```

examples/server/tests/features/steps/steps.py

Lines changed: 29 additions & 0 deletions
```diff
@@ -1,3 +1,6 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
 import asyncio
 import json
 import os
@@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context):
     context.tokenize_add_special = True
 
 
+@step("tokenizing with pieces")
+@async_run_until_complete
+async def step_tokenize_with_pieces(context):
+    context.tokenized_text = context_text(context)
+    async with aiohttp.ClientSession() as session:
+        tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
+        if getattr(context, "tokenize_add_special", None) is not None:
+            tokenize_args["add_special"] = context.tokenize_add_special
+
+        async with session.post(
+            f"{context.base_url}/tokenize", json=tokenize_args
+        ) as response:
+            assert response.status == 200
+            tokenize_json = await response.json()
+            context.tokens_with_pieces = tokenize_json["tokens"]
+
+
+@step("tokens are given with pieces")
+@async_run_until_complete
+async def step_tokens_are_given_with_pieces(context):
+    # Verify that the response contains both token IDs and pieces
+    assert all(
+        "id" in token and "piece" in token for token in context.tokens_with_pieces
+    )
+
+
 @step('tokenizing')
 @async_run_until_complete
 async def step_tokenize(context):
```

examples/server/utils.hpp

Lines changed: 34 additions & 1 deletion
```diff
@@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }
 
-static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
+static bool is_valid_utf8(const std::string & str) {
+    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
+    const unsigned char* end = bytes + str.length();
+
+    while (bytes < end) {
+        if (*bytes <= 0x7F) {
+            // 1-byte sequence (0xxxxxxx)
+            bytes++;
+        } else if ((*bytes & 0xE0) == 0xC0) {
+            // 2-byte sequence (110xxxxx 10xxxxxx)
+            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
+                return false;
+            bytes += 2;
+        } else if ((*bytes & 0xF0) == 0xE0) {
+            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
+                return false;
+            bytes += 3;
+        } else if ((*bytes & 0xF8) == 0xF0) {
+            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
+                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
+                return false;
+            bytes += 4;
+        } else {
+            // Invalid UTF-8 lead byte
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static json format_tokenizer_response(const json & tokens) {
     return json {
         {"tokens", tokens}
     };
```
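The validator accepts exactly the byte shapes named in its comments: ASCII, or a 2 to 4 byte lead followed by the right number of `10xxxxxx` continuation bytes. Python's strict decoder enforces the same structural rules (plus stricter ones, such as rejecting overlong encodings), so the split-piece cases from this commit can be cross-checked with a short sketch:

```python
def is_valid_utf8(data: bytes) -> bool:
    # Same decision the C++ helper makes, via Python's strict decoder
    try:
        data.decode("utf-8")
        return True
    except UnicodeDecodeError:
        return False

assert is_valid_utf8("Hello".encode("utf-8"))   # plain ASCII
assert is_valid_utf8("á".encode("utf-8"))       # complete C3 A1 sequence
assert not is_valid_utf8(bytes([0xC3]))         # lead byte missing its continuation
assert not is_valid_utf8(bytes([0xA1]))         # bare continuation byte
```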
