
Commit 3cb696c

Merge commit 'f2901d840e15ce2770eac300172471aa029c5fd5' into main
2 parents 3337a98 + f2901d8 commit 3cb696c

File tree

8 files changed (+160 / -39 lines)

.github/workflows/build-and-release.yaml

+3
@@ -33,6 +33,9 @@ jobs:
       - name: Build wheels
         run: python -m cibuildwheel --output-dir wheelhouse
+        env:
+          # disable repair
+          CIBW_REPAIR_WHEEL_COMMAND: ""
 
       - uses: actions/upload-artifact@v3
         with:

CHANGELOG.md

+22
@@ -7,6 +7,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.18]
+
+- Update llama.cpp to ggerganov/llama.cpp@6bb4908a17150b49373b5f977685b2e180a04f6f
+
+## [0.2.17]
+
+- Update llama.cpp to ggerganov/llama.cpp@df9d1293defe783f42bc83af732d3c670552c541
+- Hotfix: Set `CUDA_ARCHITECTURES=OFF` for `llava_shared` target on Windows by @abetlen in 4388f3341413110217b98c4f097ac5c590bdf40b
+
+## [0.2.16]
+
+- Update llama.cpp to ggerganov/llama.cpp@a75fa576abba9d37f463580c379e4bbf1e1ad03c
+- Add `set_seed` to `Llama` class by @abetlen in fd41ed3a908761d286102a019a34c2938a15118d
+- Fix server doc arguments by @kjunggithub in #892
+- Fix response_format handler in llava chat handler by @abetlen in b62c44983921197ed10a7d29dc4ba920e9979380
+- Fix default max_tokens, chat completion is now unlimited (to context length) and completion is 16 tokens to match OpenAI defaults by @abetlen in e7962d2c733cbbeec5a37392c81f64185a9a39e8
+- Fix json_schema_to_gbnf helper so that it takes a json schema string as input instead by @abetlen in faeae181b1e868643c0dc28fcf039f077baf0829
+- Add support for $ref and $def in json_schema_to_gbnf to handle more complex function schemas by @abetlen in 770df344369c0630df1be14be9f9e301e7c56d24
+- Update functionary chat handler for new OpenAI api by @abetlen in 1b376c62b775b401653facf25a519d116aafe99a
+- Fix add default stop sequence to chatml chat format by @abetlen in b84d76a844149216d511cfd8cdb9827148a1853c
+- Fix sampling bug when logits_all=False by @abetlen in 6f0b0b1b840af846938ed74d0e8170a91c40e617
+
 ## [0.2.15]
 
 - Update llama.cpp to ggerganov/llama.cpp@0a7c980b6f94a049cb804573df2d8092a34df8e4
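The 0.2.16 entries above mention a few user-facing API changes. As an illustration only (not part of this commit), a minimal sketch of how `set_seed` and the new `max_tokens` defaults behave might look like the following, assuming a local GGUF model at a placeholder path:

```python
# Illustrative sketch only; the model path and prompts are placeholders, not from this commit.
from llama_cpp import Llama

llama = Llama(model_path="./models/model.gguf")  # hypothetical local model file

# Added in 0.2.16: reseed the sampler for reproducible generations.
llama.set_seed(1337)

# create_completion keeps the OpenAI-style default of 16 new tokens...
completion = llama.create_completion("The capital of France is")
print(completion["choices"][0]["text"])

# ...while chat completion now generates up to the remaining context length
# unless max_tokens is passed explicitly.
chat = llama.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello in one sentence."}]
)
print(chat["choices"][0]["message"]["content"])
```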

CMakeLists.txt

+8
@@ -6,6 +6,8 @@ option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python
 
 if (LLAMA_BUILD)
     set(BUILD_SHARED_LIBS "On")
+
+    # Building llama
     if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
         # Need to disable these llama.cpp flags on Apple x86_64,
         # otherwise users may encounter invalid instruction errors
@@ -41,8 +43,14 @@ if (LLAMA_BUILD)
         FILES $<TARGET_RUNTIME_DLLS:llama>
         DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
     )
+
+    # Building llava
     add_subdirectory(vendor/llama.cpp/examples/llava)
     set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
+    # Set CUDA_ARCHITECTURES to OFF on windows
+    if (WIN32)
+        set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
+    endif()
     install(
         TARGETS llava_shared
         LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp

examples/notebooks/Functions.ipynb

+103 -18
@@ -1,15 +1,41 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Functions\n",
+    "\n",
+    "The OpenAI compatible web server in `llama-cpp-python` supports function calling.\n",
+    "\n",
+    "Function calling allows API clients to specify a schema that gives the model a format it should respond in.\n",
+    "Function calling in `llama-cpp-python` works by combining models pretrained for function calling such as [`functionary`](https://huggingface.co/abetlen/functionary-7b-v1-GGUF) with constrained sampling to produce a response that is compatible with the schema.\n",
+    "\n",
+    "Note however that this improves but does not guarantee that the response will be compatible with the schema.\n",
+    "\n",
+    "## Requirements\n",
+    "\n",
+    "Before we begin you will need the following:\n",
+    "\n",
+    "- A running `llama-cpp-python` server with a function calling compatible model. [See here](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)\n",
+    "- The OpenAI Python Client `pip install openai`\n",
+    "- (Optional) The Instructor Python Library `pip install instructor`\n",
+    "\n",
+    "## Function Calling with OpenAI Python Client\n",
+    "\n",
+    "We'll start with a basic demo that only uses the OpenAI Python Client."
+   ]
+  },
  {
   "cell_type": "code",
-  "execution_count": 29,
+  "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "ChatCompletion(id='chatcmpl-b6dcbb47-1120-4761-8cd9-83542c97647b', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content=\"The current temperature in San Francisco is 72 degrees Fahrenheit. It's a sunny day with clear skies, making it perfect for outdoor activities.\\n \", role='assistant', function_call=None, tool_calls=None))], created=1699602158, model='gpt-3.5-turbo-1106', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=38, prompt_tokens=135, total_tokens=173))\n"
+     "ChatCompletion(id='chatcmpl-a2d9eb9f-7354-472f-b6ad-4d7a807729a3', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='The current weather in San Francisco is **72°F** (22°C).\\n ', role='assistant', function_call=None, tool_calls=None))], created=1699638365, model='gpt-3.5-turbo-1106', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=22, prompt_tokens=136, total_tokens=158))\n"
     ]
    }
   ],
@@ -20,7 +46,7 @@
    "\n",
    "client = openai.OpenAI(\n",
    "    api_key = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\", # can be anything\n",
-   "    base_url = \"http://100.64.159.73:8000/v1\"\n",
+   "    base_url = \"http://100.64.159.73:8000/v1\" # NOTE: Replace with IP address and port of your llama-cpp-python server\n",
    ")\n",
    "\n",
    "# Example dummy function hard coded to return the same weather\n",
@@ -100,9 +126,32 @@
    "print(run_conversation())"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Function Calling with Instructor\n",
+    "\n",
+    "The above example is a bit verbose and requires you to manually verify the schema.\n",
+    "\n",
+    "For our next examples we'll use the `instructor` library to simplify the process and accomplish a number of different tasks with function calling.\n",
+    "\n",
+    "You'll first need to install the [`instructor`](https://github.com/jxnl/instructor/).\n",
+    "\n",
+    "You can do so by running the following command in your terminal:\n",
+    "\n",
+    "```bash\n",
+    "pip install instructor\n",
+    "```\n",
+    "\n",
+    "Below we'll go through a few basic examples taken directly from the [instructor cookbook](https://jxnl.github.io/instructor/)\n",
+    "\n",
+    "## Basic Usage"
+   ]
+  },
  {
   "cell_type": "code",
-  "execution_count": 30,
+  "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
@@ -139,11 +188,28 @@
    "print(user)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Text Classification\n",
+    "\n",
+    "### Single-Label Classification"
+   ]
+  },
  {
   "cell_type": "code",
-  "execution_count": 31,
+  "execution_count": 7,
   "metadata": {},
-  "outputs": [],
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "class_label=<Labels.SPAM: 'spam'>\n"
+    ]
+   }
+  ],
   "source": [
    "import enum\n",
    "\n",
@@ -172,19 +238,27 @@
    "    ) # type: ignore\n",
    "\n",
    "prediction = classify(\"Hello there I'm a Nigerian prince and I want to give you money\")\n",
-   "assert prediction.class_label == Labels.SPAM"
+   "assert prediction.class_label == Labels.SPAM\n",
+   "print(prediction)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Multi-Label Classification"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 32,
+  "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "class_labels=[<MultiLabels.BILLING: 'billing'>, <MultiLabels.TECH_ISSUE: 'tech_issue'>]\n"
+     "class_labels=[<MultiLabels.TECH_ISSUE: 'tech_issue'>, <MultiLabels.BILLING: 'billing'>]\n"
    ]
   }
  ],
@@ -223,16 +297,27 @@
    "print(prediction)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Self-Critique"
+   ]
+  },
  {
   "cell_type": "code",
-  "execution_count": 33,
+  "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "question='What is the meaning of life?' answer='The meaning of life, according to the Devil, is to live a life of sin and debauchery.'\n"
+     "question='What is the meaning of life?' answer='According to the Devil, the meaning of life is to live a life of sin and debauchery.'\n",
+     "1 validation error for QuestionAnswerNoEvil\n",
+     "answer\n",
+     "  Assertion failed, The statement promotes sin and debauchery, which can be considered objectionable. [type=assertion_error, input_value='According to the Devil, ... of sin and debauchery.', input_type=str]\n",
+     "    For further information visit https://errors.pydantic.dev/2.3/v/assertion_error\n"
     ]
    }
   ],
@@ -294,6 +379,13 @@
    "    print(e)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Answering Questions with Validated Citations"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 42,
@@ -366,13 +458,6 @@
    "qa = ask_ai(question, context)\n",
    "print(qa)"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
  }
 ],
 "metadata": {

llama_cpp/__init__.py

+1 -1
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.15"
+__version__ = "0.2.18"

llama_cpp/llama.py

+6 -7
@@ -1019,27 +1019,26 @@ def eval(self, tokens: Sequence[int]):
         """
         assert self._ctx.ctx is not None
         assert self._batch.batch is not None
-        n_ctx = self._n_ctx
+        self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
-            n_past = min(n_ctx - len(batch), self.n_tokens)
+            n_past = self.n_tokens
             n_tokens = len(batch)
-            self._ctx.kv_cache_seq_rm(-1, n_past, -1)
             self._batch.set_batch(
                 batch=batch, n_past=n_past, logits_all=self.context_params.logits_all
             )
             self._ctx.decode(self._batch)
             # Save tokens
-            self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
+            self.input_ids[n_past : n_past + n_tokens] = batch
             # Save logits
-            rows = n_tokens if self.context_params.logits_all else 1
+            rows = n_tokens
             cols = self._n_vocab
             offset = (
                 0 if self.context_params.logits_all else n_tokens - 1
             )  # NOTE: Only save the last token logits if logits_all is False
-            self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape(
+            self.scores[n_past + offset : n_past + n_tokens, :].reshape(
                 -1
-            )[:] = self._ctx.get_logits()[: rows * cols]
+            )[:] = self._ctx.get_logits()[offset * cols : rows * cols]
             # Update n_tokens
             self.n_tokens += n_tokens
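A reviewer's note on the `eval` hunk above: every slice is now keyed off `n_past` (the running token count), and when `logits_all=False` only the final row of the returned logits is stored, which is the sampling fix called out in the 0.2.18 changelog. The following toy sketch mimics just that index arithmetic with NumPy stand-ins; it does not model llama.cpp's real buffer layout:

```python
# Toy model of the new eval() bookkeeping; NumPy stand-ins, not the real llama.cpp buffers.
import numpy as np

n_vocab, n_ctx, logits_all = 8, 16, False
scores = np.zeros((n_ctx, n_vocab), dtype=np.single)

n_past = 3                    # tokens already evaluated (self.n_tokens)
batch = [101, 102, 103, 104]  # next chunk of tokens to decode
n_tokens = len(batch)

# Pretend the backend handed back one row of logits per decoded token, flattened.
returned = np.arange(n_tokens * n_vocab, dtype=np.single)

rows = n_tokens
offset = 0 if logits_all else n_tokens - 1   # keep only the last row when logits_all=False
scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[:] = (
    returned[offset * n_vocab : rows * n_vocab]
)

# Only row n_past + n_tokens - 1 is populated here, matching what sampling reads next.
print(scores[n_past + n_tokens - 1])
```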

tests/test_llama.py

+16 -12
@@ -1,4 +1,7 @@
+import ctypes
+
 import pytest
+
 import llama_cpp
 
 MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama.gguf"
@@ -36,19 +39,20 @@ def test_llama_cpp_tokenization():
 
 
 def test_llama_patch(monkeypatch):
-    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
+    n_ctx = 128
+    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx)
     n_vocab = llama_cpp.llama_n_vocab(llama._model.model)
+    assert n_vocab == 32000
 
     ## Set up mock function
-    def mock_eval(*args, **kwargs):
+    def mock_decode(*args, **kwargs):
         return 0
 
     def mock_get_logits(*args, **kwargs):
-        return (llama_cpp.c_float * n_vocab)(
-            *[llama_cpp.c_float(0) for _ in range(n_vocab)]
-        )
+        size = n_vocab * n_ctx
+        return (llama_cpp.c_float * size)()
 
-    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_eval)
+    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_decode)
     monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)
 
     output_text = " jumps over the lazy dog."
@@ -126,19 +130,19 @@ def test_llama_pickle():
 
 
 def test_utf8(monkeypatch):
-    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
+    n_ctx = 512
+    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx, logits_all=True)
     n_vocab = llama.n_vocab()
 
     ## Set up mock function
-    def mock_eval(*args, **kwargs):
+    def mock_decode(*args, **kwargs):
         return 0
 
     def mock_get_logits(*args, **kwargs):
-        return (llama_cpp.c_float * n_vocab)(
-            *[llama_cpp.c_float(0) for _ in range(n_vocab)]
-        )
+        size = n_vocab * n_ctx
+        return (llama_cpp.c_float * size)()
 
-    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_eval)
+    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_decode)
     monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)
 
     output_text = "😀"

vendor/llama.cpp
