Skip to content

Commit f93a2bb

Browse files
committed
abetlen#717: Add support for Huggingface Autotokenizer
1 parent b76724c commit f93a2bb

File tree

1 file changed

+21
-0
lines changed

1 file changed

+21
-0
lines changed

llama_cpp/llama_chat_format.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,3 +320,24 @@ def format_chatml(
320320
_messages.append((_roles["assistant"], None))
321321
_prompt = _format_chatml(system_message, _messages, _sep)
322322
return ChatFormatterResponse(prompt=_prompt)
323+
324+
# eg, export HF_MODEL=mistralai/Mistral-7B-Instruct-v0.1
@register_chat_format("autotokenizer")
def format_autotokenizer(
    messages: List[llama_types.ChatCompletionRequestMessage],
    **kwargs: Any,
) -> ChatFormatterResponse:
    """Format a chat prompt via a Huggingface AutoTokenizer chat template.

    The Huggingface model id is read from the ``HF_MODEL`` environment
    variable (e.g. ``mistralai/Mistral-7B-Instruct-v0.1``) and its
    tokenizer's chat template is applied to *messages*.

    Args:
        messages: The chat messages to format into a single prompt.
        **kwargs: Ignored; accepted for chat-format handler compatibility.

    Returns:
        ChatFormatterResponse with the rendered prompt and the tokenizer's
        EOS token as the stop sequence.

    Raises:
        ValueError: If ``HF_MODEL`` is not set in the environment.
    """
    # https://huggingface.co/docs/transformers/main/chat_templating
    # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format
    # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/blob/main/tokenizer_config.json
    # Imports are function-local so `transformers` stays an optional
    # dependency, only required when this chat format is actually used.
    import os

    from transformers import AutoTokenizer

    huggingface_model = os.getenv("HF_MODEL")  # eg, mistralai/Mistral-7B-Instruct-v0.1
    if not huggingface_model:
        raise ValueError(
            "HF_MODEL needs to be set in env to use chat format 'autotokenizer'"
        )
    tokenizer = AutoTokenizer.from_pretrained(huggingface_model)
    # Disable the tokenizer's built-in default system prompt so only the
    # caller-supplied messages shape the rendered conversation.
    tokenizer.use_default_system_prompt = False
    _prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    # Return formatted prompt and eos token by default
    return ChatFormatterResponse(prompt=_prompt, stop=tokenizer.eos_token)

0 commit comments

Comments
 (0)