
Commit 3cb696c

Merge commit 'f2901d840e15ce2770eac300172471aa029c5fd5' into main
2 parents 3337a98 + f2901d8 commit 3cb696c

File tree

8 files changed (+160 / -39 lines)

.github/workflows/build-and-release.yaml

+3
@@ -33,6 +33,9 @@ jobs:
       - name: Build wheels
         run: python -m cibuildwheel --output-dir wheelhouse
+        env:
+          # disable repair
+          CIBW_REPAIR_WHEEL_COMMAND: ""
 
       - uses: actions/upload-artifact@v3
         with:

CHANGELOG.md

+22
@@ -7,6 +7,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.18]
+
+- Update llama.cpp to ggerganov/llama.cpp@6bb4908a17150b49373b5f977685b2e180a04f6f
+
+## [0.2.17]
+
+- Update llama.cpp to ggerganov/llama.cpp@df9d1293defe783f42bc83af732d3c670552c541
+- Hotfix: Set `CUDA_ARCHITECTURES=OFF` for `llava_shared` target on Windows by @abetlen in 4388f3341413110217b98c4f097ac5c590bdf40b
+
+## [0.2.16]
+
+- Update llama.cpp to ggerganov/llama.cpp@a75fa576abba9d37f463580c379e4bbf1e1ad03c
+- Add `set_seed` to `Llama` class by @abetlen in fd41ed3a908761d286102a019a34c2938a15118d
+- Fix server doc arguments by @kjunggithub in #892
+- Fix response_format handler in llava chat handler by @abetlen in b62c44983921197ed10a7d29dc4ba920e9979380
+- Fix default max_tokens, chat completion is now unlimited (to context length) and completion is 16 tokens to match OpenAI defaults by @abetlen in e7962d2c733cbbeec5a37392c81f64185a9a39e8
+- Fix json_schema_to_gbnf helper so that it takes a json schema string as input instead by @abetlen in faeae181b1e868643c0dc28fcf039f077baf0829
+- Add support for $ref and $def in json_schema_to_gbnf to handle more complex function schemas by @abetlen in 770df344369c0630df1be14be9f9e301e7c56d24
+- Update functionary chat handler for new OpenAI api by @abetlen in 1b376c62b775b401653facf25a519d116aafe99a
+- Fix add default stop sequence to chatml chat format by @abetlen in b84d76a844149216d511cfd8cdb9827148a1853c
+- Fix sampling bug when logits_all=False by @abetlen in 6f0b0b1b840af846938ed74d0e8170a91c40e617
+
 ## [0.2.15]
 
 - Update llama.cpp to ggerganov/llama.cpp@0a7c980b6f94a049cb804573df2d8092a34df8e4
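The 0.2.16 entries above mention a few user-facing API changes. As an illustration only (not part of this commit), a minimal sketch of how `set_seed` and the new `max_tokens` defaults behave might look like the following, assuming a local GGUF model at a placeholder path:

```python
# Illustrative sketch only; the model path and prompts are placeholders, not from this commit.
from llama_cpp import Llama

llama = Llama(model_path="./models/model.gguf")  # hypothetical local model file

# Added in 0.2.16: reseed the sampler for reproducible generations.
llama.set_seed(1337)

# create_completion keeps the OpenAI-style default of 16 new tokens...
completion = llama.create_completion("The capital of France is")
print(completion["choices"][0]["text"])

# ...while chat completion now generates up to the remaining context length
# unless max_tokens is passed explicitly.
chat = llama.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello in one sentence."}]
)
print(chat["choices"][0]["message"]["content"])
```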

CMakeLists.txt

+8
@@ -6,6 +6,8 @@ option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python
 
 if (LLAMA_BUILD)
     set(BUILD_SHARED_LIBS "On")
+
+    # Building llama
     if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
         # Need to disable these llama.cpp flags on Apple x86_64,
         # otherwise users may encounter invalid instruction errors
@@ -41,8 +43,14 @@ if (LLAMA_BUILD)
         FILES $<TARGET_RUNTIME_DLLS:llama>
         DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
     )
+
+    # Building llava
     add_subdirectory(vendor/llama.cpp/examples/llava)
     set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
+    # Set CUDA_ARCHITECTURES to OFF on windows
+    if (WIN32)
+        set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
+    endif()
     install(
         TARGETS llava_shared
         LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp

examples/notebooks/Functions.ipynb

+103 -18
@@ -1,15 +1,41 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Functions\n",
+    "\n",
+    "The OpenAI compatible web server in `llama-cpp-python` supports function calling.\n",
+    "\n",
+    "Function calling allows API clients to specify a schema that gives the model a format it should respond in.\n",
+    "Function calling in `llama-cpp-python` works by combining models pretrained for function calling such as [`functionary`](https://huggingface.co/abetlen/functionary-7b-v1-GGUF) with constrained sampling to produce a response that is compatible with the schema.\n",
+    "\n",
+    "Note however that this improves but does not guarantee that the response will be compatible with the schema.\n",
+    "\n",
+    "## Requirements\n",
+    "\n",
+    "Before we begin you will need the following:\n",
+    "\n",
+    "- A running `llama-cpp-python` server with a function calling compatible model. [See here](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)\n",
+    "- The OpenAI Python Client `pip install openai`\n",
+    "- (Optional) The Instructor Python Library `pip install instructor`\n",
+    "\n",
+    "## Function Calling with OpenAI Python Client\n",
+    "\n",
+    "We'll start with a basic demo that only uses the OpenAI Python Client."
+   ]
+  },
  {
   "cell_type": "code",
-  "execution_count": 29,
+  "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "ChatCompletion(id='chatcmpl-b6dcbb47-1120-4761-8cd9-83542c97647b', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content=\"The current temperature in San Francisco is 72 degrees Fahrenheit. It's a sunny day with clear skies, making it perfect for outdoor activities.\\n \", role='assistant', function_call=None, tool_calls=None))], created=1699602158, model='gpt-3.5-turbo-1106', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=38, prompt_tokens=135, total_tokens=173))\n"
+     "ChatCompletion(id='chatcmpl-a2d9eb9f-7354-472f-b6ad-4d7a807729a3', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='The current weather in San Francisco is **72°F** (22°C).\\n ', role='assistant', function_call=None, tool_calls=None))], created=1699638365, model='gpt-3.5-turbo-1106', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=22, prompt_tokens=136, total_tokens=158))\n"
     ]
    }
   ],
@@ -20,7 +46,7 @@
    "\n",
    "client = openai.OpenAI(\n",
    "    api_key = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\", # can be anything\n",
-   "    base_url = \"http://100.64.159.73:8000/v1\"\n",
+   "    base_url = \"http://100.64.159.73:8000/v1\" # NOTE: Replace with IP address and port of your llama-cpp-python server\n",
    ")\n",
    "\n",
    "# Example dummy function hard coded to return the same weather\n",
@@ -100,9 +126,32 @@
    "print(run_conversation())"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Function Calling with Instructor\n",
+    "\n",
+    "The above example is a bit verbose and requires you to manually verify the schema.\n",
+    "\n",
+    "For our next examples we'll use the `instructor` library to simplify the process and accomplish a number of different tasks with function calling.\n",
+    "\n",
+    "You'll first need to install the [`instructor`](https://github.com/jxnl/instructor/).\n",
+    "\n",
+    "You can do so by running the following command in your terminal:\n",
+    "\n",
+    "```bash\n",
+    "pip install instructor\n",
+    "```\n",
+    "\n",
+    "Below we'll go through a few basic examples taken directly from the [instructor cookbook](https://jxnl.github.io/instructor/)\n",
+    "\n",
+    "## Basic Usage"
+   ]
+  },
  {
   "cell_type": "code",
-  "execution_count": 30,
+  "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
@@ -139,11 +188,28 @@
    "print(user)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Text Classification\n",
+    "\n",
+    "### Single-Label Classification"
+   ]
+  },
  {
   "cell_type": "code",
-  "execution_count": 31,
+  "execution_count": 7,
   "metadata": {},
-  "outputs": [],
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "class_label=<Labels.SPAM: 'spam'>\n"
+    ]
+   }
+  ],
   "source": [
    "import enum\n",
    "\n",
@@ -172,19 +238,27 @@
    "    ) # type: ignore\n",
    "\n",
    "prediction = classify(\"Hello there I'm a Nigerian prince and I want to give you money\")\n",
-   "assert prediction.class_label == Labels.SPAM"
+   "assert prediction.class_label == Labels.SPAM\n",
+   "print(prediction)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Multi-Label Classification"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 32,
+  "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "class_labels=[<MultiLabels.BILLING: 'billing'>, <MultiLabels.TECH_ISSUE: 'tech_issue'>]\n"
+     "class_labels=[<MultiLabels.TECH_ISSUE: 'tech_issue'>, <MultiLabels.BILLING: 'billing'>]\n"
    ]
   }
  ],
@@ -223,16 +297,27 @@
    "print(prediction)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Self-Critique"
+   ]
+  },
  {
   "cell_type": "code",
-  "execution_count": 33,
+  "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "question='What is the meaning of life?' answer='The meaning of life, according to the Devil, is to live a life of sin and debauchery.'\n"
+     "question='What is the meaning of life?' answer='According to the Devil, the meaning of life is to live a life of sin and debauchery.'\n",
+     "1 validation error for QuestionAnswerNoEvil\n",
+     "answer\n",
+     "  Assertion failed, The statement promotes sin and debauchery, which can be considered objectionable. [type=assertion_error, input_value='According to the Devil, ... of sin and debauchery.', input_type=str]\n",
+     "    For further information visit https://errors.pydantic.dev/2.3/v/assertion_error\n"
     ]
    }
   ],
@@ -294,6 +379,13 @@
    "    print(e)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Answering Questions with Validated Citations"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 42,
@@ -366,13 +458,6 @@
    "qa = ask_ai(question, context)\n",
    "print(qa)"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
  }
 ],
 "metadata": {

llama_cpp/__init__.py

+1 -1
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.15"
+__version__ = "0.2.18"

llama_cpp/llama.py

+6 -7
@@ -1019,27 +1019,26 @@ def eval(self, tokens: Sequence[int]):
         """
         assert self._ctx.ctx is not None
         assert self._batch.batch is not None
-        n_ctx = self._n_ctx
+        self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
-            n_past = min(n_ctx - len(batch), self.n_tokens)
+            n_past = self.n_tokens
             n_tokens = len(batch)
-            self._ctx.kv_cache_seq_rm(-1, n_past, -1)
             self._batch.set_batch(
                 batch=batch, n_past=n_past, logits_all=self.context_params.logits_all
             )
             self._ctx.decode(self._batch)
             # Save tokens
-            self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
+            self.input_ids[n_past : n_past + n_tokens] = batch
             # Save logits
-            rows = n_tokens if self.context_params.logits_all else 1
+            rows = n_tokens
             cols = self._n_vocab
             offset = (
                 0 if self.context_params.logits_all else n_tokens - 1
             )  # NOTE: Only save the last token logits if logits_all is False
-            self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape(
+            self.scores[n_past + offset : n_past + n_tokens, :].reshape(
                 -1
-            )[:] = self._ctx.get_logits()[: rows * cols]
+            )[:] = self._ctx.get_logits()[offset * cols : rows * cols]
             # Update n_tokens
             self.n_tokens += n_tokens
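A reviewer's note on the `eval` hunk above: every slice is now keyed off `n_past` (the running token count), and when `logits_all=False` only the final row of the returned logits is stored, which is the sampling fix called out in the 0.2.18 changelog. The following toy sketch mimics just that index arithmetic with NumPy stand-ins; it does not model llama.cpp's real buffer layout:

```python
# Toy model of the new eval() bookkeeping; NumPy stand-ins, not the real llama.cpp buffers.
import numpy as np

n_vocab, n_ctx, logits_all = 8, 16, False
scores = np.zeros((n_ctx, n_vocab), dtype=np.single)

n_past = 3                    # tokens already evaluated (self.n_tokens)
batch = [101, 102, 103, 104]  # next chunk of tokens to decode
n_tokens = len(batch)

# Pretend the backend handed back one row of logits per decoded token, flattened.
returned = np.arange(n_tokens * n_vocab, dtype=np.single)

rows = n_tokens
offset = 0 if logits_all else n_tokens - 1   # keep only the last row when logits_all=False
scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[:] = (
    returned[offset * n_vocab : rows * n_vocab]
)

# Only row n_past + n_tokens - 1 is populated here, matching what sampling reads next.
print(scores[n_past + n_tokens - 1])
```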

tests/test_llama.py

+16 -12
@@ -1,4 +1,7 @@
+import ctypes
+
 import pytest
+
 import llama_cpp
 
 MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama.gguf"
@@ -36,19 +39,20 @@ def test_llama_cpp_tokenization():
 
 
 def test_llama_patch(monkeypatch):
-    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
+    n_ctx = 128
+    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx)
     n_vocab = llama_cpp.llama_n_vocab(llama._model.model)
+    assert n_vocab == 32000
 
     ## Set up mock function
-    def mock_eval(*args, **kwargs):
+    def mock_decode(*args, **kwargs):
         return 0
 
     def mock_get_logits(*args, **kwargs):
-        return (llama_cpp.c_float * n_vocab)(
-            *[llama_cpp.c_float(0) for _ in range(n_vocab)]
-        )
+        size = n_vocab * n_ctx
+        return (llama_cpp.c_float * size)()
 
-    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_eval)
+    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_decode)
     monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)
 
     output_text = " jumps over the lazy dog."
@@ -126,19 +130,19 @@ def test_llama_pickle():
 
 
 def test_utf8(monkeypatch):
-    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
+    n_ctx = 512
+    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx, logits_all=True)
     n_vocab = llama.n_vocab()
 
     ## Set up mock function
-    def mock_eval(*args, **kwargs):
+    def mock_decode(*args, **kwargs):
         return 0
 
     def mock_get_logits(*args, **kwargs):
-        return (llama_cpp.c_float * n_vocab)(
-            *[llama_cpp.c_float(0) for _ in range(n_vocab)]
-        )
+        size = n_vocab * n_ctx
+        return (llama_cpp.c_float * size)()
 
-    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_eval)
+    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_decode)
     monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)
 
     output_text = "😀"

vendor/llama.cpp
