import diskcache
import ctypes

-from . import llama_cpp
from .llama_types import *
from .llama_grammar import LlamaGrammar
+import llama_cpp.llama_cpp as llama_cpp
import llama_cpp.llama_chat_format as llama_chat_format

import numpy as np
@@ -752,6 +752,7 @@ def __init__(
        numa: bool = False,
        # Chat Format Params
        chat_format: str = "llama-2",
+        chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
        # Misc
        verbose: bool = True,
        # Extra Params
@@ -784,6 +785,7 @@ def __init__(
            lora_path: Path to a LoRA file to apply to the model.
            numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
            chat_format: String specifying the chat format to use when calling create_chat_completion.
+            chat_handler: Optional chat handler to use when calling create_chat_completion.
            verbose: Print verbose output to stderr.

        Raises:
@@ -910,6 +912,7 @@ def __init__(
            print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)

        self.chat_format = chat_format
+        self.chat_handler = chat_handler

        self._n_vocab = self.n_vocab()
        self._n_ctx = self.n_ctx()
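The new chat_handler constructor argument lets callers bypass the format-based lookup. A minimal sketch of how it might be used (the model path and handler name are hypothetical; the only call contract shown in this commit is the keyword-argument invocation inside create_chat_completion, so any callable accepting those keywords should satisfy the LlamaChatCompletionHandler type):

import sys

from llama_cpp import Llama
import llama_cpp.llama_chat_format as llama_chat_format


def logging_handler(llama, messages, **kwargs):
    # Hypothetical handler: log the request, then delegate to the handler
    # registered for the "llama-2" chat format.
    print(f"chat request with {len(messages)} messages", file=sys.stderr)
    base = llama_chat_format.get_chat_completion_handler("llama-2")
    return base(llama=llama, messages=messages, **kwargs)


llm = Llama(model_path="./model.gguf", chat_handler=logging_handler)  # hypothetical path

When chat_handler is left as None, create_chat_completion falls back to the handler registered for chat_format, as before.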
@@ -1231,7 +1234,7 @@ def create_embedding(
        else:
            inputs = input

-        data: List[EmbeddingData] = []
+        data: List[Embedding] = []
        total_tokens = 0
        for index, input in enumerate(inputs):
            tokens = self.tokenize(input.encode("utf-8"), special=True)
@@ -1276,7 +1279,7 @@ def embed(self, input: str) -> List[float]:

    def _create_completion(
        self,
-        prompt: str,
+        prompt: Union[str, List[int]],
        suffix: Optional[str] = None,
        max_tokens: int = 16,
        temperature: float = 0.8,
@@ -1297,7 +1300,9 @@ def _create_completion(
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        logits_processor: Optional[LogitsProcessorList] = None,
        grammar: Optional[LlamaGrammar] = None,
-    ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
+    ) -> Union[
+        Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]
+    ]:
        assert self._ctx is not None
        assert suffix is None or suffix.__class__ is str

@@ -1309,7 +1314,7 @@ def _create_completion(
            self.tokenize(prompt.encode("utf-8"), special=True)
            if prompt != ""
            else [self.token_bos()]
-        )
+        ) if isinstance(prompt, str) else prompt
        text: bytes = b""
        returned_tokens: int = 0
        stop = (
@@ -1322,7 +1327,7 @@ def _create_completion(

        if len(prompt_tokens) >= self._n_ctx:
            raise ValueError(
-                f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self._ctx)}"
+                f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
            )

        if max_tokens <= 0:
@@ -1732,7 +1737,7 @@ def _create_completion(

    def create_completion(
        self,
-        prompt: str,
+        prompt: Union[str, List[int]],
        suffix: Optional[str] = None,
        max_tokens: int = 128,
        temperature: float = 0.8,
@@ -1753,7 +1758,7 @@ def create_completion(
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        logits_processor: Optional[LogitsProcessorList] = None,
        grammar: Optional[LlamaGrammar] = None,
-    ) -> Union[Completion, Iterator[CompletionChunk]]:
+    ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
        """Generate text from a prompt.

        Args:
@@ -1800,7 +1805,7 @@ def create_completion(
            grammar=grammar,
        )
        if stream:
-            chunks: Iterator[CompletionChunk] = completion_or_chunks
+            chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks
            return chunks
        completion: Completion = next(completion_or_chunks)  # type: ignore
        return completion
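With prompt widened to Union[str, List[int]], create_completion (and _create_completion) can now take a pre-tokenized prompt directly. A small sketch, assuming a local GGUF model at a hypothetical path:

from llama_cpp import Llama

llm = Llama(model_path="./model.gguf")  # hypothetical path

# Token ids are passed straight through instead of being tokenized again.
token_prompt = llm.tokenize(b"Q: Name the planets in the solar system. A: ", special=True)
output = llm.create_completion(prompt=token_prompt, max_tokens=32, stop=["Q:"])
print(output["choices"][0]["text"])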
@@ -1828,7 +1833,7 @@ def __call__(
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        logits_processor: Optional[LogitsProcessorList] = None,
        grammar: Optional[LlamaGrammar] = None,
-    ) -> Union[Completion, Iterator[CompletionChunk]]:
+    ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
        """Generate text from a prompt.

        Args:
@@ -1879,7 +1884,9 @@ def create_chat_completion(
        self,
        messages: List[ChatCompletionRequestMessage],
        functions: Optional[List[ChatCompletionFunction]] = None,
-        function_call: Optional[Union[str, ChatCompletionFunctionCall]] = None,
+        function_call: Optional[ChatCompletionRequestFunctionCall] = None,
+        tools: Optional[List[ChatCompletionTool]] = None,
+        tool_choice: Optional[ChatCompletionToolChoiceOption] = None,
        temperature: float = 0.2,
        top_p: float = 0.95,
        top_k: int = 40,
@@ -1896,7 +1903,9 @@ def create_chat_completion(
        model: Optional[str] = None,
        logits_processor: Optional[LogitsProcessorList] = None,
        grammar: Optional[LlamaGrammar] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+    ) -> Union[
+        CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]
+    ]:
        """Generate a chat completion from a list of messages.

        Args:
@@ -1912,12 +1921,16 @@ def create_chat_completion(
        Returns:
            Generated chat completion or a stream of chat completion chunks.
        """
-        handler = llama_chat_format.get_chat_completion_handler(self.chat_format)
+        handler = self.chat_handler or llama_chat_format.get_chat_completion_handler(
+            self.chat_format
+        )
        return handler(
-            self,
+            llama=self,
            messages=messages,
            functions=functions,
            function_call=function_call,
+            tools=tools,
+            tool_choice=tool_choice,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
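create_chat_completion now also forwards OpenAI-style tools and tool_choice to the handler. A usage sketch (the tool schema below is illustrative; whether the selected chat handler actually acts on tools depends on that handler, since this change only threads the arguments through):

from llama_cpp import Llama

llm = Llama(model_path="./model.gguf", chat_format="llama-2")  # hypothetical path

response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the weather in Berlin?"},
    ],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_weather",  # hypothetical tool
                "description": "Look up the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ],
    tool_choice={"type": "function", "function": {"name": "get_weather"}},
)
print(response["choices"][0]["message"])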
@@ -1974,6 +1987,7 @@ def __getstate__(self):
            numa=self.numa,
            # Chat Format Params
            chat_format=self.chat_format,
+            chat_handler=self.chat_handler,
            # Misc
            verbose=self.verbose,
        )
@@ -2015,6 +2029,7 @@ def __setstate__(self, state):
            numa=state["numa"],
            # Chat Format Params
            chat_format=state["chat_format"],
+            chat_handler=state["chat_handler"],
            # Misc
            verbose=state["verbose"],
        )