From 4ec35390d72faba70942b9605dfcbde2bda0bdad Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Sun, 15 Oct 2023 19:35:23 +0200 Subject: [PATCH 01/26] llava v1.5 integration --- examples/multimodal/llava.py | 75 +++++++++++++++++++++++++ examples/multimodal/overfitting_lc.png | Bin 0 -> 5982 bytes llama_cpp/llama.py | 2 + llama_cpp/llama_cpp.py | 67 ++++++++++++++++++++++ llama_cpp/server/app.py | 4 ++ vendor/llama.cpp | 2 +- 6 files changed, 149 insertions(+), 1 deletion(-) create mode 100644 examples/multimodal/llava.py create mode 100644 examples/multimodal/overfitting_lc.png diff --git a/examples/multimodal/llava.py b/examples/multimodal/llava.py new file mode 100644 index 000000000..a209625c1 --- /dev/null +++ b/examples/multimodal/llava.py @@ -0,0 +1,75 @@ +import ctypes +import json +import argparse +import os +import array +import sys + +from llama_cpp import (Llama, clip_model_load, llava_image_embed_make_with_filename, llava_image_embed_make_with_bytes, + llava_image_embed_p, llava_image_embed_free, llava_validate_embed_size, llava_eval_image_embed) + +parser = argparse.ArgumentParser() +parser.add_argument("-m", "--model", type=str, default="../models/llava-v1.5-7b/ggml-model-q5_k.gguf") +parser.add_argument("--mmproj", type=str, default="llava-v1.5-7b/mmproj-model-f16.gguf") +parser.add_argument("-t", "--temp", type=float, default=0.1) +parser.add_argument("-p", "--prompt", type=str, default="Describe this image in detail.") +args = parser.parse_args() + +print(f"loading clip model from {args.mmproj}") +if not os.path.exists(args.mmproj): + raise FileNotFoundError(args.mmproj) +ctx_clip = clip_model_load(args.mmproj.encode('utf-8')) + +image_path = os.path.join(os.path.dirname(__file__), "overfitting_lc.png") +if not os.path.exists(image_path): + raise FileNotFoundError(image_path) +image_embed = llava_image_embed_make_with_filename(ctx_clip=ctx_clip, n_threads=1, filename=image_path.encode('utf8')) + +def load_image_embed_from_file_bytes(image_path: str) -> llava_image_embed_p: + with open(image_path, 'rb') as file: + image_bytes = file.read() + bytes_length = len(image_bytes) + data_array = array.array('B', image_bytes) + c_ubyte_ptr = (ctypes.c_ubyte * len(data_array)).from_buffer(data_array) + return llava_image_embed_make_with_bytes(ctx_clip=ctx_clip, n_threads=1, image_bytes=c_ubyte_ptr, image_bytes_length=bytes_length) + +print(f"loading llm model from {args.model}") +if not os.path.exists(args.model): + raise FileNotFoundError(args.model) +llm = Llama(model_path=args.model, n_ctx=2048, n_gpu_layers=1) # longer context needed for image embeds + +if not llava_validate_embed_size(llm.ctx, ctx_clip): + raise RuntimeError("llm and mmproj model embed size mismatch") + +# eval system prompt +system_prompt = "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
+llm.eval(llm.tokenize(system_prompt.encode('utf8')))
+llm.eval(llm.tokenize("\nUSER: ".encode('utf8')))
+
+# eval image embed
+n_past = ctypes.c_int(llm.n_tokens)
+n_past_p = ctypes.byref(n_past)
+llava_eval_image_embed(llm.ctx, image_embed, llm.n_batch, n_past_p)
+llm.n_tokens = n_past.value
+llava_image_embed_free(image_embed)
+
+# eval prompt
+prompt = 'Describe the visual content of this image'
+llm.eval(llm.tokenize(prompt.encode('utf8')))
+llm.eval(llm.tokenize("\nASSISTANT:".encode('utf8')))
+
+# get output
+print("\n")
+max_target_len = 256
+for i in range(max_target_len):
+    t_id = llm.sample(temp=0.1)
+    t = llm.detokenize([t_id]).decode('utf8')
+    if t == "":
+        break
+    print(t, end="")
+    sys.stdout.flush()
+    llm.eval([t_id])
+
+print("\n")
+print("done")
+
diff --git a/examples/multimodal/overfitting_lc.png b/examples/multimodal/overfitting_lc.png
new file mode 100644
index 0000000000000000000000000000000000000000..591b34c68e1ca19bab4d790de6c98e70e773fdf9
GIT binary patch
literal 5982
[base85-encoded binary PNG image data omitted]

+# LLAMA_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
+def clip_model_load(fname: Union[c_char_p, bytes], verbosity: c_int = 0) -> clip_ctx_p:
+    """ load mmproj model """
+    return _lib.clip_model_load(fname, verbosity)
+_lib.clip_model_load.argtypes = [c_char_p, c_int]
+_lib.clip_model_load.restype = clip_ctx_p
+
+
+# LLAMA_API void clip_free(struct clip_ctx * ctx);
+def clip_free(ctx: clip_ctx_p):
+    """ free mmproj model """
+    _lib.clip_free(ctx)
+_lib.clip_free.argtypes = [clip_ctx_p]
+_lib.clip_free.restype = None
+
+
+#LLAMA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
+def llava_validate_embed_size(ctx_llama: llama_context_p, ctx_clip: clip_ctx_p) -> c_bool:
+    """ sanity check for clip <-> llava embed size match """
+    return _lib.llava_validate_embed_size(ctx_llama, ctx_clip)
+_lib.llava_validate_embed_size.argtypes = [llama_context_p, clip_ctx_p]
+_lib.llava_validate_embed_size.restype = c_bool
+
+
+#LLAMA_API struct llava_image_embed *
llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
+def llava_image_embed_make_with_bytes(ctx_clip: clip_ctx_p, n_threads: Union[int,c_int], image_bytes: c_uint8_p, image_bytes_length: c_size_t) -> llava_image_embed_p:
+    """ build an image embed by interpreting image_bytes as the contents of an image file with byte size image_bytes_length.
+    supported formats (autodetected): JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC (ref https://github.com/nothings/stb) """
+    return _lib.llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length)
+_lib.llava_image_embed_make_with_bytes.argtypes = [clip_ctx_p, c_int, c_uint8_p, c_size_t]
+_lib.llava_image_embed_make_with_bytes.restype = llava_image_embed_p
+
+
+#LLAMA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
+def llava_image_embed_make_with_filename(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], filename: Union[c_char_p, bytes]) -> llava_image_embed_p:
+    """ build an image embed from a path to an image filename """
+    return _lib.llava_image_embed_make_with_filename(ctx_clip, n_threads, filename)
+_lib.llava_image_embed_make_with_filename.argtypes = [clip_ctx_p, c_int, c_char_p]
+_lib.llava_image_embed_make_with_filename.restype = llava_image_embed_p
+
+#LLAMA_API void llava_image_embed_free(struct llava_image_embed * embed);
+def llava_image_embed_free(embed: llava_image_embed_p):
+    """ free an embedding made with one of the llava_image_embed_make_ methods """
+    _lib.llava_image_embed_free(embed)
+_lib.llava_image_embed_free.argtypes = [llava_image_embed_p]
+_lib.llava_image_embed_free.restype = None
+
+#LLAMA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
+def llava_eval_image_embed(ctx: llama_context_p, image_embed: llava_image_embed_p, n_batch: c_int, n_past: c_int_p) -> c_bool:
+    """ write the image represented by embed into the llama context with batch size n_batch,
+    starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed."""
+    return _lib.llava_eval_image_embed(ctx, image_embed, n_batch, n_past)
+_lib.llava_eval_image_embed.argtypes = [llama_context_p, llava_image_embed_p, c_int, c_int_p]
+_lib.llava_eval_image_embed.restype = c_bool
+
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 18cd47ce1..30c7a0e63 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -41,6 +41,9 @@ class Settings(BaseSettings):
         default=None,
         description="The alias of the model to use for generating completions.",
     )
+    model_mproj: str = Field(
+        description="For multimodal models (eg Llava), the path to the multimodal projector model."
+    )
     seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed.
-1 for random.") n_ctx: int = Field(default=2048, ge=1, description="The context size.") n_batch: int = Field( @@ -345,6 +348,7 @@ def create_app(settings: Optional[Settings] = None): global llama llama = llama_cpp.Llama( model_path=settings.model, + model_mproj_path=settings.model_mproj, seed=settings.seed, n_ctx=settings.n_ctx, n_batch=settings.n_batch, diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b8fe4b5cc..5a9155189 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b8fe4b5cc9cb237ca98e5bc51b5d189e3c446d13 +Subproject commit 5a9155189945cd9aa6b98a4a340b38dc93c8d219 From 48f4228c05692936af6b9b6407ccfa8a4be789e4 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 13:25:29 -0500 Subject: [PATCH 02/26] Point llama.cpp to fork --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 7edf0975d..6fe937b38 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp - url = https://github.com/ggerganov/llama.cpp.git + url = https://github.com/damian0815/llama.cpp.git From 61a1e5c18733c9a47dc7d35b124fc6691ad8bba1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 13:25:43 -0500 Subject: [PATCH 03/26] Add llava shared library target --- CMakeLists.txt | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c633c0797..8d063708d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,4 +41,23 @@ if (LLAMA_BUILD) FILES $ DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp ) + add_subdirectory(vendor/llama.cpp/examples/llava) + set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava") + install( + TARGETS llava_shared + LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp + RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp + ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp + FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp + RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp + ) + # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374 + install( + TARGETS llava_shared + LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + ) endif() From 46ce32326f9005f14ff39020e43b5a0980e2448e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 13:25:51 -0500 Subject: [PATCH 04/26] Fix type --- llama_cpp/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index a4d21004f..e0dbdf854 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1078,7 +1078,7 @@ def llama_batch_get_one( tokens, # type: Array[llama_token] n_tokens: Union[c_int, int], pos_0: Union[llama_pos, int], - seq_id: llama_seq_id, + seq_id: Union[llama_seq_id, int], ) -> llama_batch: return _lib.llama_batch_get_one(tokens, n_tokens, pos_0, seq_id) From 0d8a91b7944492f28be4cf007ee4a6b4c83a412c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 13:26:15 -0500 Subject: [PATCH 05/26] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2833a6f63..22f43fca0 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp 
@@ -1 +1 @@ -Subproject commit 2833a6f63c1b87c7f4ac574bcf7a15a2f3bf3ede +Subproject commit 22f43fca0ac2237766f825a8ab4aa2d5e19238d0 From 0c950665103e08558be03e4725b387aadfa1bf03 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 13:26:25 -0500 Subject: [PATCH 06/26] Add llava api --- llama_cpp/llava_cpp.py | 232 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 llama_cpp/llava_cpp.py diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py new file mode 100644 index 000000000..5dc4b4201 --- /dev/null +++ b/llama_cpp/llava_cpp.py @@ -0,0 +1,232 @@ +import sys +import os +import ctypes +from ctypes import ( + c_bool, + c_char_p, + c_int, + c_int8, + c_int32, + c_uint8, + c_uint32, + c_size_t, + c_float, + c_double, + c_void_p, + POINTER, + _Pointer, # type: ignore + Structure, + Array, +) +import pathlib +from typing import List, Union + +import llama_cpp.llama_cpp as llama_cpp + +# Load the library +def _load_shared_library(lib_base_name: str): + # Construct the paths to the possible shared library names + _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) + # Searching for the library in the current directory under the name "libllama" (default name + # for llamacpp) and "llama" (default name for this repo) + _lib_paths: List[pathlib.Path] = [] + # Determine the file extension based on the platform + if sys.platform.startswith("linux"): + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + ] + elif sys.platform == "darwin": + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + _base_path / f"lib{lib_base_name}.dylib", + ] + elif sys.platform == "win32": + _lib_paths += [ + _base_path / f"{lib_base_name}.dll", + _base_path / f"lib{lib_base_name}.dll", + ] + else: + raise RuntimeError("Unsupported platform") + + if "LLAMA_CPP_LIB" in os.environ: + lib_base_name = os.environ["LLAMA_CPP_LIB"] + _lib = pathlib.Path(lib_base_name) + _base_path = _lib.parent.resolve() + _lib_paths = [_lib.resolve()] + + cdll_args = dict() # type: ignore + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32" and sys.version_info >= (3, 8): + os.add_dll_directory(str(_base_path)) + if "CUDA_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) + cdll_args["winmode"] = ctypes.RTLD_GLOBAL + + # Try to load the shared library, handling potential errors + for _lib_path in _lib_paths: + if _lib_path.exists(): + try: + return ctypes.CDLL(str(_lib_path), **cdll_args) + except Exception as e: + raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") + + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found" + ) + + +# Specify the base name of the shared library to load +_libllava_base_name = "llava" + +# Load the library +_libllava = _load_shared_library(_libllava_base_name) + + +################################################ +# llava.h +################################################ + +# struct clip_ctx; +clip_ctx_p = c_void_p + +# struct llava_image_embed { +# float * embed; +# int n_image_pos; +# }; +class llava_image_embed(Structure): + _fields_ = [ + ("embed", POINTER(c_float)), + ("n_image_pos", c_int), + ] + +# /** sanity check for clip <-> llava embed size match */ +# LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); +def llava_validate_embed_size(ctx_llama: 
llama_cpp.llama_context_p, ctx_clip: clip_ctx_p) -> bool: + return _libllava.llava_validate_embed_size(ctx_llama, ctx_clip) + +_libllava.llava_validate_embed_size.argtypes = [llama_cpp.llama_context_p, clip_ctx_p] +_libllava.llava_validate_embed_size.restype = c_bool + +# /** build an image embed from image file bytes */ +# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); +def llava_image_embed_make_with_bytes(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_bytes: bytes, image_bytes_length: Union[c_int, int]) -> "_Pointer[llava_image_embed]": + return _libllava.llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length) + +_libllava.llava_image_embed_make_with_bytes.argtypes = [clip_ctx_p, c_int, POINTER(c_uint8), c_int] +_libllava.llava_image_embed_make_with_bytes.restype = POINTER(llava_image_embed) + +# /** build an image embed from a path to an image filename */ +# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); +def llava_image_embed_make_with_filename(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes) -> "_Pointer[llava_image_embed]": + return _libllava.llava_image_embed_make_with_filename(ctx_clip, n_threads, image_path) + +_libllava.llava_image_embed_make_with_filename.argtypes = [clip_ctx_p, c_int, c_char_p] +_libllava.llava_image_embed_make_with_filename.restype = POINTER(llava_image_embed) + +# LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); +# /** free an embedding made with llava_image_embed_make_* */ +def llava_image_embed_free(embed: "_Pointer[llava_image_embed]"): + return _libllava.llava_image_embed_free(embed) + +_libllava.llava_image_embed_free.argtypes = [POINTER(llava_image_embed)] +_libllava.llava_image_embed_free.restype = None + +# /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. 
*/ +# LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); +def llava_eval_image_embed(ctx_llama: llama_cpp.llama_context_p, embed: "_Pointer[llava_image_embed]", n_batch: Union[c_int, int], n_past: Union[c_int, int]) -> bool: + return _libllava.llava_eval_image_embed(ctx_llama, embed, n_batch, n_past) + +_libllava.llava_eval_image_embed.argtypes = [llama_cpp.llama_context_p, POINTER(llava_image_embed), c_int, POINTER(c_int)] +_libllava.llava_eval_image_embed.restype = c_bool + + +################################################ +# clip.h +################################################ + + +# struct clip_vision_hparams { +# int32_t image_size; +# int32_t patch_size; +# int32_t hidden_size; +# int32_t n_intermediate; +# int32_t projection_dim; +# int32_t n_head; +# int32_t n_layer; +# float eps; +# }; +class clip_vision_hparams(Structure): + _fields_ = [ + ("image_size", c_int32), + ("patch_size", c_int32), + ("hidden_size", c_int32), + ("n_intermediate", c_int32), + ("projection_dim", c_int32), + ("n_head", c_int32), + ("n_layer", c_int32), + ("eps", c_float), + ] + +# /** load mmproj model */ +# CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity); +def clip_model_load(fname: bytes, verbosity: Union[c_int, int]) -> clip_ctx_p: + return _libllava.clip_model_load(fname, verbosity) + +_libllava.clip_model_load.argtypes = [c_char_p, c_int] +_libllava.clip_model_load.restype = clip_ctx_p + +# /** free mmproj model */ +# CLIP_API void clip_free(struct clip_ctx * ctx); +def clip_free(ctx: clip_ctx_p): + return _libllava.clip_free(ctx) + +_libllava.clip_free.argtypes = [clip_ctx_p] +_libllava.clip_free.restype = None + +# size_t clip_embd_nbytes(const struct clip_ctx * ctx); +# int clip_n_patches(const struct clip_ctx * ctx); +# int clip_n_mmproj_embd(const struct clip_ctx * ctx); + +# // RGB uint8 image +# struct clip_image_u8 { +# int nx; +# int ny; +# uint8_t * data = NULL; +# size_t size; +# }; + +# // RGB float32 image (NHWC) +# // Memory layout: RGBRGBRGB... 
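# A minimal usage sketch of the llava.h bindings defined above (an illustrative
# sketch, not part of the patch itself; the model, projector and image paths and
# the thread count follow the defaults used in examples/multimodal/llava.py and
# are placeholder assumptions):
#
#     import ctypes
#     from llama_cpp import Llama
#     from llama_cpp.llava_cpp import (
#         clip_model_load, clip_free,
#         llava_image_embed_make_with_filename, llava_image_embed_free,
#         llava_validate_embed_size, llava_eval_image_embed,
#     )
#
#     llm = Llama(model_path="llava-v1.5-7b/ggml-model-q5_k.gguf", n_ctx=2048)
#     ctx_clip = clip_model_load(b"llava-v1.5-7b/mmproj-model-f16.gguf", 1)
#     assert llava_validate_embed_size(llm.ctx, ctx_clip)  # embed sizes must match
#
#     embed = llava_image_embed_make_with_filename(ctx_clip, 1, b"overfitting_lc.png")
#     n_past = ctypes.c_int(llm.n_tokens)
#     llava_eval_image_embed(llm.ctx, embed, llm.n_batch, ctypes.byref(n_past))
#     llm.n_tokens = n_past.value  # resync the wrapper's token counter
#
#     llava_image_embed_free(embed)
#     clip_free(ctx_clip)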
+# struct clip_image_f32 { +# int nx; +# int ny; +# float * data = NULL; +# size_t size; +# }; + +# struct clip_image_u8_batch { +# struct clip_image_u8 * data; +# size_t size; +# }; + +# struct clip_image_f32_batch { +# struct clip_image_f32 * data; +# size_t size; +# }; + +# struct clip_image_u8 * make_clip_image_u8(); +# struct clip_image_f32 * make_clip_image_f32(); +# CLIP_API void clip_image_u8_free(clip_image_u8 * img); +# CLIP_API void clip_image_f32_free(clip_image_f32 * img); +# CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); +# /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ +# CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); + +# bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square); +# bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec); + +# bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs, +# float * vec); + +# bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype); \ No newline at end of file From 9406d631e02f53badd31199f82681f0dfc3296bc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 13:55:28 -0500 Subject: [PATCH 07/26] Revert changes to llama and llama_cpp --- llama_cpp/llama.py | 935 +++++++++++++++++++++++++++-------------- llama_cpp/llama_cpp.py | 69 +-- 2 files changed, 632 insertions(+), 372 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ba70f2060..6dc113ac9 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -208,6 +208,506 @@ def __call__( return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) +class _LlamaModel: + """Intermediate Python wrapper for a llama.cpp llama_model. 
+ + NOTE: For stability it's recommended you use the Llama class instead.""" + + _llama_free_model = llama_cpp._lib.llama_free_model # type: ignore + + def __init__( + self, + *, + path_model: str, + params: llama_cpp.llama_model_params, + verbose: bool = True, + ): + self.path_model = path_model + self.params = params + self.verbose = verbose + + if not os.path.exists(path_model): + raise ValueError(f"Model path does not exist: {path_model}") + + with suppress_stdout_stderr(disable=self.verbose): + self.model = llama_cpp.llama_load_model_from_file( + self.path_model.encode("utf-8"), self.params + ) + + def __del__(self): + with suppress_stdout_stderr(disable=self.verbose): + if self.model is not None: + self._llama_free_model(self.model) + self.model = None + + def vocab_type(self) -> int: + assert self.model is not None + return llama_cpp.llama_vocab_type(self.model) + + def n_vocab(self) -> int: + assert self.model is not None + return llama_cpp.llama_n_vocab(self.model) + + def n_ctx_train(self) -> int: + assert self.model is not None + return llama_cpp.llama_n_ctx_train(self.model) + + def n_embd(self) -> int: + assert self.model is not None + return llama_cpp.llama_n_embd(self.model) + + def rope_freq_scale_train(self) -> float: + assert self.model is not None + return llama_cpp.llama_rope_freq_scale_train(self.model) + + def desc(self) -> str: + assert self.model is not None + buf = ctypes.create_string_buffer(1024) + llama_cpp.llama_model_desc(self.model, buf, 1024) # type: ignore + return buf.value.decode("utf-8") + + def size(self) -> int: + assert self.model is not None + return llama_cpp.llama_model_size(self.model) + + def n_params(self) -> int: + assert self.model is not None + return llama_cpp.llama_model_n_params(self.model) + + def get_tensor(self, name: str) -> ctypes.c_void_p: + assert self.model is not None + return llama_cpp.llama_get_model_tensor(self.model, name.encode("utf-8")) + + def apply_lora_from_file( + self, + lora_path: str, + scale: float, + path_base_model: Optional[str], + n_threads: int, + ): + assert self.model is not None + return llama_cpp.llama_model_apply_lora_from_file( + self.model, + lora_path.encode("utf-8"), + scale, + path_base_model.encode("utf-8") + if path_base_model is not None + else llama_cpp.c_char_p(0), + n_threads, + ) + + # Vocab + + def token_get_text(self, token: int) -> str: + # TODO: Fix + assert self.model is not None + return llama_cpp.llama_token_get_text(self.model, token).decode("utf-8") + + def token_get_score(self, token: int) -> float: + assert self.model is not None + return llama_cpp.llama_token_get_score(self.model, token) + + def token_get_type(self, token: int) -> int: + assert self.model is not None + return llama_cpp.llama_token_get_type(self.model, token) + + # Special tokens + + def token_bos(self) -> int: + assert self.model is not None + return llama_cpp.llama_token_bos(self.model) + + def token_eos(self) -> int: + assert self.model is not None + return llama_cpp.llama_token_eos(self.model) + + def token_nl(self) -> int: + assert self.model is not None + return llama_cpp.llama_token_nl(self.model) + + def token_prefix(self) -> int: + assert self.model is not None + return llama_cpp.llama_token_prefix(self.model) + + def token_middle(self) -> int: + assert self.model is not None + return llama_cpp.llama_token_middle(self.model) + + def token_suffix(self) -> int: + assert self.model is not None + return llama_cpp.llama_token_suffix(self.model) + + def token_eot(self) -> int: + assert self.model is not None + 
return llama_cpp.llama_token_eot(self.model) + + # Tokenization + + def tokenize(self, text: bytes, add_bos: bool, special: bool): + assert self.model is not None + n_ctx = self.n_ctx_train() + tokens = (llama_cpp.llama_token * n_ctx)() + n_tokens = llama_cpp.llama_tokenize( + self.model, text, len(text), tokens, n_ctx, add_bos, special + ) + if n_tokens < 0: + n_tokens = abs(n_tokens) + tokens = (llama_cpp.llama_token * n_tokens)() + n_tokens = llama_cpp.llama_tokenize( + self.model, text, len(text), tokens, n_tokens, add_bos, special + ) + if n_tokens < 0: + raise RuntimeError( + f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' + ) + return list(tokens[:n_tokens]) + + def token_to_piece(self, token: int) -> bytes: + assert self.model is not None + buf = ctypes.create_string_buffer(32) + llama_cpp.llama_token_to_piece(self.model, token, buf, 32) # type: ignore + return bytes(buf) + + def detokenize(self, tokens: List[int]) -> bytes: + assert self.model is not None + output = b"" + size = 32 + buffer = (ctypes.c_char * size)() + for token in tokens: + n = llama_cpp.llama_token_to_piece( + self.model, llama_cpp.llama_token(token), buffer, size + ) + assert n <= size + output += bytes(buffer[:n]) + # NOTE: Llama1 models automatically added a space at the start of the prompt + # this line removes a leading space if the first token is a beginning of sentence token + return ( + output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output + ) + + @staticmethod + def default_params(): + """Get the default llama_model_params.""" + return llama_cpp.llama_model_default_params() + + +class _LlamaContext: + """Intermediate Python wrapper for a llama.cpp llama_context. + + NOTE: For stability it's recommended you use the Llama class instead.""" + + _llama_free = llama_cpp._lib.llama_free # type: ignore + + def __init__( + self, + *, + model: _LlamaModel, + params: llama_cpp.llama_context_params, + verbose: bool = True, + ): + self.model = model + self.params = params + self.verbose = verbose + + with suppress_stdout_stderr(disable=self.verbose): + self.ctx = llama_cpp.llama_new_context_with_model( + self.model.model, self.params + ) + + def __del__(self): + with suppress_stdout_stderr(disable=self.verbose): + if self.ctx is not None: + self._llama_free(self.ctx) + self.ctx = None + + def n_ctx(self) -> int: + assert self.ctx is not None + return llama_cpp.llama_n_ctx(self.ctx) + + def kv_cache_clear(self): + assert self.ctx is not None + llama_cpp.llama_kv_cache_clear(self.ctx) + + def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int): + assert self.ctx is not None + llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1) + + def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): + assert self.ctx is not None + llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1) + + def kv_cache_seq_keep(self, seq_id: int): + assert self.ctx is not None + llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id) + + def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int): + assert self.ctx is not None + llama_cpp.llama_kv_cache_seq_shift(self.ctx, seq_id, p0, p1, shift) + + def get_state_size(self) -> int: + assert self.ctx is not None + return llama_cpp.llama_get_state_size(self.ctx) + + # TODO: copy_state_data + + # TODO: set_state_data + + # TODO: llama_load_session_file + + # TODO: llama_save_session_file + + def decode(self, batch: "_LlamaBatch"): + assert self.ctx is not None + assert batch.batch is not None + return_code = 
llama_cpp.llama_decode( + ctx=self.ctx, + batch=batch.batch, + ) + if return_code != 0: + raise RuntimeError(f"llama_decode returned {return_code}") + + def set_n_threads(self, n_threads: int, n_threads_batch: int): + assert self.ctx is not None + llama_cpp.llama_set_n_threads(self.ctx, n_threads, n_threads_batch) + + def get_logits(self): + assert self.ctx is not None + return llama_cpp.llama_get_logits(self.ctx) + + def get_logits_ith(self, i: int): + assert self.ctx is not None + return llama_cpp.llama_get_logits_ith(self.ctx, i) + + def get_embeddings(self): + assert self.ctx is not None + return llama_cpp.llama_get_embeddings(self.ctx) + + # Sampling functions + + def set_rng_seed(self, seed: int): + assert self.ctx is not None + llama_cpp.llama_set_rng_seed(self.ctx, seed) + + def sample_repetition_penalties( + self, + candidates: "_LlamaTokenDataArray", + last_tokens_data: "llama_cpp.Array[llama_cpp.llama_token]", + penalty_last_n: int, + penalty_repeat: float, + penalty_freq: float, + penalty_present: float, + ): + assert self.ctx is not None + llama_cpp.llama_sample_repetition_penalties( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + last_tokens_data, + penalty_last_n, + penalty_repeat, + penalty_freq, + penalty_present, + ) + + def sample_classifier_free_guidance( + self, + candidates: "_LlamaTokenDataArray", + guidance_ctx: "_LlamaContext", + scale: float, + ): + assert self.ctx is not None + assert guidance_ctx.ctx is not None + llama_cpp.llama_sample_classifier_free_guidance( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + guidance_ctx.ctx, + scale, + ) + + def sample_softmax(self, candidates: "_LlamaTokenDataArray"): + assert self.ctx is not None + llama_cpp.llama_sample_softmax( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + ) + + def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int): + assert self.ctx is not None + llama_cpp.llama_sample_top_k( + self.ctx, ctypes.byref(candidates.candidates), k, min_keep # type: ignore + ) + + def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): + assert self.ctx is not None + llama_cpp.llama_sample_top_p( + self.ctx, ctypes.byref(candidates.candidates), p, min_keep # type: ignore + ) + + def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): + assert self.ctx is not None + llama_cpp.llama_sample_min_p( + self.ctx, ctypes.byref(candidates.candidates), p, min_keep # type: ignore + ) + + def sample_tail_free( + self, candidates: "_LlamaTokenDataArray", z: float, min_keep: int + ): + assert self.ctx is not None + llama_cpp.llama_sample_tail_free( + self.ctx, ctypes.byref(candidates.candidates), z, min_keep # type: ignore + ) + + def sample_typical( + self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int + ): + assert self.ctx is not None + llama_cpp.llama_sample_typical( + self.ctx, ctypes.byref(candidates.candidates), p, min_keep # type: ignore + ) + + def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float): + assert self.ctx is not None + llama_cpp.llama_sample_temp( + self.ctx, ctypes.byref(candidates.candidates), temp # type: ignore + ) + + def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar): + assert self.ctx is not None + assert grammar.grammar is not None + llama_cpp.llama_sample_grammar( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + grammar.grammar, + ) + + def sample_token_mirostat( + self, + candidates: 
"_LlamaTokenDataArray", + tau: float, + eta: float, + m: int, + mu: float, + ) -> int: + assert self.ctx is not None + return llama_cpp.llama_sample_token_mirostat( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + tau, + eta, + m, + ctypes.pointer(ctypes.c_float(mu)), + ) + + def sample_token_mirostat_v2( + self, candidates: "_LlamaTokenDataArray", tau: float, eta: float, mu: float + ) -> int: + assert self.ctx is not None + return llama_cpp.llama_sample_token_mirostat_v2( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + tau, + eta, + ctypes.pointer(ctypes.c_float(mu)), + ) + + def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int: + assert self.ctx is not None + return llama_cpp.llama_sample_token_greedy( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + ) + + def sample_token(self, candidates: "_LlamaTokenDataArray") -> int: + assert self.ctx is not None + return llama_cpp.llama_sample_token( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + ) + + # Grammar + def grammar_accept_token(self, grammar: LlamaGrammar, token: int): + assert self.ctx is not None + assert grammar.grammar is not None + llama_cpp.llama_grammar_accept_token(self.ctx, grammar.grammar, token) + + def reset_timings(self): + assert self.ctx is not None + llama_cpp.llama_reset_timings(self.ctx) + + def print_timings(self): + assert self.ctx is not None + llama_cpp.llama_print_timings(self.ctx) + + # Utility functions + @staticmethod + def default_params(): + """Get the default llama_context_params.""" + return llama_cpp.llama_context_default_params() + + +class _LlamaBatch: + _llama_batch_free = llama_cpp._lib.llama_batch_free # type: ignore + + def __init__( + self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True + ): + self.n_tokens = n_tokens + self.embd = embd + self.n_seq_max = n_seq_max + self.verbose = verbose + + with suppress_stdout_stderr(disable=self.verbose): + self.batch = llama_cpp.llama_batch_init( + self.n_tokens, self.embd, self.n_seq_max + ) + + def __del__(self): + with suppress_stdout_stderr(disable=self.verbose): + if self.batch is not None: + self._llama_batch_free(self.batch) + self.batch = None + + def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool): + assert self.batch is not None + n_tokens = len(batch) + self.batch.n_tokens = n_tokens + for i in range(n_tokens): + self.batch.token[i] = batch[i] + self.batch.pos[i] = n_past + i + self.batch.seq_id[i][0] = 0 + self.batch.n_seq_id[i] = 1 + self.batch.logits[i] = logits_all + self.batch.logits[n_tokens - 1] = True + + +class _LlamaTokenDataArray: + def __init__(self, *, n_vocab: int): + self.n_vocab = n_vocab + self.candidates_data = np.array( + [], + dtype=np.dtype( + [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True + ), + ) + self.candidates_data.resize(3, self.n_vocab, refcheck=False) + self.candidates = llama_cpp.llama_token_data_array( + data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p), + size=self.n_vocab, + sorted=False, + ) + self.default_candidates_data_id = np.arange(self.n_vocab, dtype=np.intc) + self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single) + + def copy_logits(self, logits: npt.NDArray[np.single]): + self.candidates_data["id"][:] = self.default_candidates_data_id + self.candidates_data["logit"][:] = logits + self.candidates_data["p"][:] = self.default_candidates_data_p + self.candidates.data = self.candidates_data.ctypes.data_as( + 
llama_cpp.llama_token_data_p + ) + self.candidates.sorted = llama_cpp.c_bool(False) + self.candidates.size = llama_cpp.c_size_t(self.n_vocab) + + class Llama: """High-level Python wrapper for a llama.cpp model.""" @@ -248,8 +748,6 @@ def __init__( lora_base: Optional[str] = None, lora_scale: float = 1.0, lora_path: Optional[str] = None, - # Multimodal Params - model_mproj_path: str = None, # Backend Params numa: bool = False, # Chat Format Params @@ -314,7 +812,9 @@ def __init__( self._p_tensor_split = None if self.tensor_split is not None: if len(self.tensor_split) > llama_cpp.LLAMA_MAX_DEVICES: - raise ValueError(f"Attempt to split tensors that exceed maximum supported devices. Current LLAMA_MAX_DEVICES={llama_cpp.LLAMA_MAX_DEVICES}") + raise ValueError( + f"Attempt to split tensors that exceed maximum supported devices. Current LLAMA_MAX_DEVICES={llama_cpp.LLAMA_MAX_DEVICES}" + ) # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES self._c_tensor_split = FloatArray( @@ -338,7 +838,9 @@ def __init__( self.context_params.n_threads = self.n_threads self.context_params.n_threads_batch = self.n_threads_batch self.context_params.rope_scaling_type = ( - rope_scaling_type if rope_scaling_type is not None else llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED + rope_scaling_type + if rope_scaling_type is not None + else llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED ) self.context_params.rope_freq_base = ( rope_freq_base if rope_freq_base != 0.0 else 0 @@ -358,9 +860,7 @@ def __init__( self.context_params.yarn_beta_slow = ( yarn_beta_slow if yarn_beta_slow != 0.0 else 0 ) - self.context_params.yarn_orig_ctx = ( - yarn_orig_ctx if yarn_orig_ctx != 0 else 0 - ) + self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 self.context_params.mul_mat_q = mul_mat_q self.context_params.f16_kv = f16_kv self.context_params.logits_all = logits_all @@ -378,32 +878,28 @@ def __init__( if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") - with suppress_stdout_stderr(disable=self.verbose): - self.model = llama_cpp.llama_load_model_from_file( - self.model_path.encode("utf-8"), self.model_params - ) - assert self.model is not None - - with suppress_stdout_stderr(disable=self.verbose): - self.ctx = llama_cpp.llama_new_context_with_model( - self.model, self.context_params - ) + self._model = _LlamaModel( + path_model=self.model_path, params=self.model_params, verbose=self.verbose + ) - assert self.ctx is not None + self._ctx = _LlamaContext( + model=self._model, + params=self.context_params, + verbose=self.verbose, + ) - with suppress_stdout_stderr(disable=self.verbose): - self.batch = llama_cpp.llama_batch_init( - self.n_batch, 0, 1 - ) + self._batch = _LlamaBatch( + n_tokens=self.n_batch, + embd=0, + n_seq_max=self.context_params.n_ctx, + verbose=self.verbose, + ) if self.lora_path: - if llama_cpp.llama_model_apply_lora_from_file( - self.model, - self.lora_path.encode("utf-8"), + if self._model.apply_lora_from_file( + self.lora_path, self.lora_scale, - self.lora_base.encode("utf-8") - if self.lora_base is not None - else llama_cpp.c_char_p(0), + self.lora_base, self.n_threads, ): raise RuntimeError( @@ -417,25 +913,11 @@ def __init__( self._n_vocab = self.n_vocab() self._n_ctx = self.n_ctx() - size = self._n_vocab - sorted = False - self._candidates_data = np.array( - [], - dtype=np.dtype( - [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True - ), - ) - 
self._candidates_data.resize(3, self._n_vocab, refcheck=False) - candidates = llama_cpp.llama_token_data_array( - data=self._candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p), - size=size, - sorted=sorted, - ) - self._candidates = candidates + self._token_nl = self.token_nl() self._token_eos = self.token_eos() - self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc) # type: ignore - self._candidates_data_p = np.zeros(self._n_vocab, dtype=np.single) + + self._candidates = _LlamaTokenDataArray(n_vocab=self._n_vocab) self.n_tokens = 0 self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) @@ -443,6 +925,16 @@ def __init__( (n_ctx, self._n_vocab), dtype=np.single ) + @property + def ctx(self) -> llama_cpp.llama_context_p: + assert self._ctx.ctx is not None + return self._ctx.ctx + + @property + def model(self) -> llama_cpp.llama_model_p: + assert self._model.model is not None + return self._model.model + @property def _input_ids(self) -> npt.NDArray[np.intc]: return self.input_ids[: self.n_tokens] @@ -462,7 +954,9 @@ def eval_logits(self) -> Deque[List[float]]: maxlen=self._n_ctx if self.context_params.logits_all else 1, ) - def tokenize(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]: + def tokenize( + self, text: bytes, add_bos: bool = True, special: bool = False + ) -> List[int]: """Tokenize a string. Args: @@ -474,35 +968,7 @@ def tokenize(self, text: bytes, add_bos: bool = True, special: bool = False) -> Returns: A list of tokens. """ - assert self.model is not None - n_ctx = self._n_ctx - tokens = (llama_cpp.llama_token * n_ctx)() - n_tokens = llama_cpp.llama_tokenize( - self.model, - text, - len(text), - tokens, - n_ctx, - add_bos, - special - ) - if n_tokens < 0: - n_tokens = abs(n_tokens) - tokens = (llama_cpp.llama_token * n_tokens)() - n_tokens = llama_cpp.llama_tokenize( - self.model, - text, - len(text), - tokens, - n_tokens, - add_bos, - special - ) - if n_tokens < 0: - raise RuntimeError( - f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' - ) - return list(tokens[:n_tokens]) + return self._model.tokenize(text, add_bos, special) def detokenize(self, tokens: List[int]) -> bytes: """Detokenize a list of tokens. @@ -513,21 +979,7 @@ def detokenize(self, tokens: List[int]) -> bytes: Returns: The detokenized string. """ - assert self.model is not None - output = b"" - size = 32 - buffer = (ctypes.c_char * size)() - for token in tokens: - n = llama_cpp.llama_token_to_piece( - self.model, llama_cpp.llama_token(token), buffer, size - ) - assert n <= size - output += bytes(buffer[:n]) - # NOTE: Llama1 models automatically added a space at the start of the prompt - # this line removes a leading space if the first token is a beginning of sentence token - return ( - output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output - ) + return self._model.detokenize(tokens) def set_cache(self, cache: Optional[BaseLlamaCache]): """Set the cache. @@ -547,28 +999,18 @@ def eval(self, tokens: Sequence[int]): Args: tokens: The list of tokens to evaluate. 
""" - assert self.ctx is not None - assert self.batch is not None + assert self._ctx.ctx is not None + assert self._batch.batch is not None n_ctx = self._n_ctx for i in range(0, len(tokens), self.n_batch): batch = tokens[i : min(len(tokens), i + self.n_batch)] - n_past = min(n_ctx - len(batch), len(self._input_ids)) + n_past = min(n_ctx - len(batch), self.n_tokens) n_tokens = len(batch) - llama_cpp.llama_kv_cache_seq_rm(self.ctx, -1, n_past, -1) - self.batch.n_tokens = n_tokens - for i in range(n_tokens): - self.batch.token[i] = batch[i] - self.batch.pos[i] = n_past + i - self.batch.seq_id[i][0] = 0 - self.batch.n_seq_id[i] = 1 - self.batch.logits[i] = True if self.context_params.logits_all else False - self.batch.logits[n_tokens - 1] = True - return_code = llama_cpp.llama_decode( - ctx=self.ctx, - batch=self.batch, + self._ctx.kv_cache_seq_rm(-1, n_past, -1) + self._batch.set_batch( + batch=batch, n_past=n_past, logits_all=self.context_params.logits_all ) - if return_code != 0: - raise RuntimeError(f"llama_decode returned {return_code}") + self._ctx.decode(self._batch) # Save tokens self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch # Save logits @@ -579,195 +1021,106 @@ def eval(self, tokens: Sequence[int]): ) # NOTE: Only save the last token logits if logits_all is False self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape( -1 - )[:] = llama_cpp.llama_get_logits(self.ctx)[: rows * cols] + )[:] = self._ctx.get_logits()[: rows * cols] # Update n_tokens self.n_tokens += n_tokens - def _sample( + def sample( self, - last_n_tokens_data, # type: llama_cpp.Array[llama_cpp.llama_token] - last_n_tokens_size: int, - top_k: int, - top_p: float, - temp: float, - tfs_z: float, - repeat_penalty: float, - frequency_penalty: float, - presence_penalty: float, - mirostat_mode: float, - mirostat_tau: float, - mirostat_eta: float, + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_eta: float = 0.1, + mirostat_tau: float = 5.0, penalize_nl: bool = True, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, ): - assert self.ctx is not None + """Sample a token from the model. + + Args: + top_k: The top-k sampling parameter. + top_p: The top-p sampling parameter. + temp: The temperature parameter. + repeat_penalty: The repeat penalty parameter. + + Returns: + The sampled token. 
+ """ + assert self._ctx is not None assert self.n_tokens > 0 + last_n_tokens_data = [llama_cpp.llama_token(0)] * max( + 0, self.last_n_tokens_size - self.n_tokens + ) + self._input_ids[-self.last_n_tokens_size :].tolist() + last_n_tokens_size = len(last_n_tokens_data) n_vocab = self._n_vocab n_ctx = self._n_ctx top_k = n_vocab if top_k <= 0 else top_k last_n_tokens_size = n_ctx if last_n_tokens_size < 0 else last_n_tokens_size + last_n_tokens_data_c = (llama_cpp.llama_token * last_n_tokens_size)( + *last_n_tokens_data + ) logits: npt.NDArray[np.single] = self._scores[-1, :] if logits_processor is not None: logits[:] = logits_processor(self._input_ids, logits) nl_logit = logits[self._token_nl] - candidates = self._candidates - candidates_data = self._candidates_data - candidates_data["id"][:] = self._candidates_data_id # type: ignore - candidates_data["logit"][:] = logits - candidates_data["p"][:] = self._candidates_data_p # type: ignore - candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) - candidates.sorted = llama_cpp.c_bool(False) - candidates.size = llama_cpp.c_size_t(n_vocab) - llama_cpp.llama_sample_repetition_penalties( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - last_tokens_data=last_n_tokens_data, + self._candidates.copy_logits(logits) + self._ctx.sample_repetition_penalties( + candidates=self._candidates, + last_tokens_data=last_n_tokens_data_c, penalty_last_n=last_n_tokens_size, penalty_repeat=repeat_penalty, penalty_freq=frequency_penalty, penalty_present=presence_penalty, ) if not penalize_nl: - candidates.data[self._token_nl].logit = llama_cpp.c_float(nl_logit) + self._candidates.candidates.data[self._token_nl].logit = llama_cpp.c_float( + nl_logit + ) if grammar is not None: - llama_cpp.llama_sample_grammar( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - grammar=grammar.grammar, + self._ctx.sample_grammar( + candidates=self._candidates, + grammar=grammar, ) if temp == 0.0: - id = llama_cpp.llama_sample_token_greedy( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - ) + id = self._ctx.sample_token_greedy(candidates=self._candidates) elif mirostat_mode == 1: - mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau) - mirostat_m = llama_cpp.c_int(100) - llama_cpp.llama_sample_temperature( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - temp=temp, - ) - id = llama_cpp.llama_sample_token_mirostat( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + self._ctx.sample_temp(candidates=self._candidates, temp=temp) + id = self._ctx.sample_token_mirostat( + candidates=self._candidates, tau=mirostat_tau, eta=mirostat_eta, - mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore - m=mirostat_m, + mu=2.0 * mirostat_tau, + m=100, ) elif mirostat_mode == 2: - mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau) - llama_cpp.llama_sample_temperature( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - temp=temp, - ) - id = llama_cpp.llama_sample_token_mirostat_v2( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + self._ctx.sample_temp(candidates=self._candidates, temp=temp) + id = self._ctx.sample_token_mirostat_v2( + candidates=self._candidates, tau=mirostat_tau, eta=mirostat_eta, - mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore + mu=2.0 * mirostat_tau, ) else: - llama_cpp.llama_sample_top_k( - ctx=self.ctx, - 
candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - k=top_k, - min_keep=llama_cpp.c_size_t(1), - ) - llama_cpp.llama_sample_tail_free( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - z=tfs_z, - min_keep=llama_cpp.c_size_t(1), - ) - llama_cpp.llama_sample_typical( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - p=llama_cpp.c_float(1.0), - min_keep=llama_cpp.c_size_t(1), - ) - llama_cpp.llama_sample_top_p( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - p=top_p, - min_keep=llama_cpp.c_size_t(1), - ) - llama_cpp.llama_sample_temperature( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - temp=temp, - ) - id = llama_cpp.llama_sample_token( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - ) + self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1) + self._ctx.sample_tail_free(candidates=self._candidates, z=tfs_z, min_keep=1) + self._ctx.sample_typical(candidates=self._candidates, p=1.0, min_keep=1) + self._ctx.sample_top_p(candidates=self._candidates, p=top_p, min_keep=1) + self._ctx.sample_temp(candidates=self._candidates, temp=temp) + id = self._ctx.sample_token(candidates=self._candidates) if grammar is not None: - llama_cpp.llama_grammar_accept_token( - ctx=self.ctx, - grammar=grammar.grammar, - token=llama_cpp.ctypes.c_int(id), - ) + self._ctx.grammar_accept_token(grammar=grammar, token=id) return id - def sample( - self, - top_k: int = 40, - top_p: float = 0.95, - temp: float = 0.80, - repeat_penalty: float = 1.1, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_eta: float = 0.1, - mirostat_tau: float = 5.0, - penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, - grammar: Optional[LlamaGrammar] = None, - ): - """Sample a token from the model. - - Args: - top_k: The top-k sampling parameter. - top_p: The top-p sampling parameter. - temp: The temperature parameter. - repeat_penalty: The repeat penalty parameter. - - Returns: - The sampled token. - """ - assert self.ctx is not None - last_n_tokens_data = [llama_cpp.llama_token(0)] * max( - 0, self.last_n_tokens_size - len(self._input_ids) - ) + self._input_ids[-self.last_n_tokens_size :].tolist() - return self._sample( - last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)( - *last_n_tokens_data - ), - last_n_tokens_size=self.last_n_tokens_size, - top_k=top_k, - top_p=top_p, - temp=temp, - tfs_z=tfs_z, - repeat_penalty=repeat_penalty, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - penalize_nl=penalize_nl, - logits_processor=logits_processor, - grammar=grammar, - ) - def generate( self, tokens: Sequence[int], @@ -805,8 +1158,7 @@ def generate( Yields: The generated tokens. """ - assert self.ctx is not None - if reset and len(self._input_ids) > 0: + if reset and self.n_tokens > 0: longest_prefix = 0 for a, b in zip(self._input_ids, tokens[:-1]): if a == b: @@ -862,8 +1214,8 @@ def create_embedding( Returns: An embedding object. 
""" - assert self.ctx is not None - assert self.model is not None + assert self._ctx.ctx is not None + assert self._model.model is not None model_name: str = model if model is not None else self.model_path if self.context_params.embedding == False: @@ -872,7 +1224,7 @@ def create_embedding( ) if self.verbose: - llama_cpp.llama_reset_timings(self.ctx) + llama_cpp.llama_reset_timings(self._ctx.ctx) if isinstance(input, str): inputs = [input] @@ -887,8 +1239,8 @@ def create_embedding( self.eval(tokens) n_tokens = len(tokens) total_tokens += n_tokens - embedding = llama_cpp.llama_get_embeddings(self.ctx)[ - : llama_cpp.llama_n_embd(self.model) + embedding = llama_cpp.llama_get_embeddings(self._ctx.ctx)[ + : llama_cpp.llama_n_embd(self._model.model) ] data.append( @@ -899,7 +1251,7 @@ def create_embedding( } ) if self.verbose: - llama_cpp.llama_print_timings(self.ctx) + llama_cpp.llama_print_timings(self._ctx.ctx) return { "object": "list", @@ -946,7 +1298,7 @@ def _create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: - assert self.ctx is not None + assert self._ctx is not None assert suffix is None or suffix.__class__ is str completion_id: str = f"cmpl-{str(uuid.uuid4())}" @@ -966,16 +1318,16 @@ def _create_completion( model_name: str = model if model is not None else self.model_path if self.verbose: - llama_cpp.llama_reset_timings(self.ctx) + self._ctx.reset_timings() - if len(prompt_tokens) >= llama_cpp.llama_n_ctx(self.ctx): + if len(prompt_tokens) >= self._n_ctx: raise ValueError( - f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" + f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self._ctx)}" ) if max_tokens <= 0: # Unlimited, depending on n_ctx. 
- max_tokens = llama_cpp.llama_n_ctx(self.ctx) - len(prompt_tokens) + max_tokens = self._n_ctx - len(prompt_tokens) # Truncate max_tokens if requested tokens would exceed the context window max_tokens = ( @@ -1186,7 +1538,7 @@ def _create_completion( finish_reason = "stop" if self.verbose: - llama_cpp.llama_print_timings(self.ctx) + self._ctx.print_timings() if stream: remaining_tokens = completion_tokens[returned_tokens:] @@ -1584,24 +1936,6 @@ def create_chat_completion( grammar=grammar, ) - def _free_model(self, *, _lbatch_free=llama_cpp._lib.llama_batch_free, _lfree_model=llama_cpp._lib.llama_free_model, _free=llama_cpp._lib.llama_free): - batch = getattr(self, 'batch', None) - if batch is not None: - _lbatch_free(batch) - self.batch = None - model = getattr(self, 'model', None) - if model is not None: - _lfree_model(model) - self.model = None - ctx = getattr(self, 'ctx', None) - if ctx is not None: - _free(ctx) - self.ctx = None - - def __del__(self): - with suppress_stdout_stderr(disable=self.verbose): - self._free_model() - def __getstate__(self): return dict( model_path=self.model_path, @@ -1686,16 +2020,16 @@ def __setstate__(self, state): ) def save_state(self) -> LlamaState: - assert self.ctx is not None + assert self._ctx.ctx is not None if self.verbose: print("Llama.save_state: saving llama state", file=sys.stderr) - state_size = llama_cpp.llama_get_state_size(self.ctx) + state_size = llama_cpp.llama_get_state_size(self._ctx.ctx) if self.verbose: print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr) llama_state = (llama_cpp.c_uint8 * int(state_size))() if self.verbose: print("Llama.save_state: allocated state", file=sys.stderr) - n_bytes = llama_cpp.llama_copy_state_data(self.ctx, llama_state) + n_bytes = llama_cpp.llama_copy_state_data(self._ctx.ctx, llama_state) if self.verbose: print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr) if int(n_bytes) > int(state_size): @@ -1716,7 +2050,7 @@ def save_state(self) -> LlamaState: ) def load_state(self, state: LlamaState) -> None: - assert self.ctx is not None + assert self._ctx.ctx is not None self.scores = state.scores.copy() self.input_ids = state.input_ids.copy() self.n_tokens = state.n_tokens @@ -1724,43 +2058,36 @@ def load_state(self, state: LlamaState) -> None: LLamaStateArrayType = llama_cpp.c_uint8 * state_size llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state) - if llama_cpp.llama_set_state_data(self.ctx, llama_state) != state_size: + if llama_cpp.llama_set_state_data(self._ctx.ctx, llama_state) != state_size: raise RuntimeError("Failed to set llama state data") def n_ctx(self) -> int: """Return the context window size.""" - assert self.ctx is not None - return llama_cpp.llama_n_ctx(self.ctx) + return self._ctx.n_ctx() def n_embd(self) -> int: """Return the embedding size.""" - assert self.model is not None - return llama_cpp.llama_n_embd(self.model) + return self._model.n_embd() def n_vocab(self) -> int: """Return the vocabulary size.""" - assert self.model is not None - return llama_cpp.llama_n_vocab(self.model) + return self._model.n_vocab() def tokenizer(self) -> "LlamaTokenizer": """Return the tokenizer for this model.""" - assert self.ctx is not None return LlamaTokenizer(self) def token_eos(self) -> int: """Return the end-of-sequence token.""" - assert self.model is not None - return llama_cpp.llama_token_eos(self.model) + return self._model.token_eos() def token_bos(self) -> int: """Return the beginning-of-sequence token.""" - assert self.model is not 
None - return llama_cpp.llama_token_bos(self.model) + return self._model.token_bos() def token_nl(self) -> int: """Return the newline token.""" - assert self.model is not None - return llama_cpp.llama_token_nl(self.model) + return self._model.token_nl() @staticmethod def logits_to_logprobs(logits: List[float]) -> List[float]: diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 8e683d113..a4d21004f 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -83,8 +83,6 @@ def _load_shared_library(lib_base_name: str): # Misc c_float_p = POINTER(c_float) -c_float_p_p = POINTER(POINTER(c_float)) -c_int_p = POINTER(c_int) c_uint8_p = POINTER(c_uint8) c_size_t_p = POINTER(c_size_t) @@ -115,11 +113,6 @@ def _load_shared_library(lib_base_name: str): # struct llama_context; llama_context_p = c_void_p -# struct clip_ctx; -clip_ctx_p = c_void_p - -# struct llava_image_embed; -llava_image_embed_p = c_void_p; # typedef int32_t llama_pos; llama_pos = c_int32 @@ -1085,7 +1078,7 @@ def llama_batch_get_one( tokens, # type: Array[llama_token] n_tokens: Union[c_int, int], pos_0: Union[llama_pos, int], - seq_id: Union[llama_seq_id, int], + seq_id: llama_seq_id, ) -> llama_batch: return _lib.llama_batch_get_one(tokens, n_tokens, pos_0, seq_id) @@ -1969,63 +1962,3 @@ def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p): _lib.llama_dump_timing_info_yaml.argtypes = [ctypes.c_void_p, llama_context_p] _lib.llama_dump_timing_info_yaml.restype = None - - -# LLAVA - - -# LLAMA_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity); -def clip_model_load(fname: Union[c_char_p, bytes], verbosity: c_int = 0) -> clip_ctx_p: - """ load mmproj model """ - return _lib.clip_model_load(fname, verbosity) -_lib.clip_model_load.argtypes = [c_char_p, c_int] -_lib.clip_model_load.restype = clip_ctx_p - - -# LLAMA_API void clip_free(struct clip_ctx * ctx); -def clip_free(ctx: clip_ctx_p): - """ free mmproj model """ - _lib.clip_free(ctx) -_lib.clip_free.argtypes = [clip_ctx_p] -_lib.clip_free.restype = None - - -#LLAMA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); -def llava_validate_embed_size(ctx_llama: llama_context_p, ctx_clip: clip_ctx_p) -> c_bool: - """ sanity check for clip <-> llava embed size match """ - return _lib.llava_validate_embed_size(ctx_llama, ctx_clip) -_lib.llava_validate_embed_size.argtypes = [llama_context_p, clip_ctx_p] -_lib.llava_validate_embed_size.restype = c_bool - - -#LLAMA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); -def llava_image_embed_make_with_bytes(ctx_clip: clip_ctx_p, n_threads: Union[int,c_int], image_bytes: c_uint8_p, image_bytes_length: c_size_t) -> llava_image_embed_p: - """ build an image embed by interpreting image_bytes as the contents of an image file with byte size image_bytes_length. 
- supported formats (autodetected): JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC (ref https://github.com/nothings/stb) """ - return _lib.llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length) -_lib.llava_image_embed_make_with_bytes.argtypes = [clip_ctx_p, c_int, c_uint8_p, c_size_t] -_lib.llava_image_embed_make_with_bytes.restype = llava_image_embed_p - - -#LLAMA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); -def llava_image_embed_make_with_filename(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], filename: Union[c_char_p, bytes]) -> llava_image_embed_p: - """ build an image embed from a path to an image filename """ - return _lib.llava_image_embed_make_with_filename(ctx_clip, n_threads, filename) -_lib.llava_image_embed_make_with_filename.argtypes = [clip_ctx_p, c_int, c_char_p] -_lib.llava_image_embed_make_with_filename.restype = llava_image_embed_p - -#LLAMA_API void llava_image_embed_free(struct llava_image_embed * embed); -def llava_image_embed_free(embed: llava_image_embed_p): - """ free an embedding made with one of the llava_image_embed_make_ methods """ - _lib.llava_image_embed_free(embed) -_lib.llava_image_embed_free.argtypes = [llava_image_embed_p] -_lib.llava_image_embed_free.restype = None - -#LLAMA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); -def llava_eval_image_embed(ctx: llama_context_p, image_embed: llava_image_embed_p, n_batch: c_int, n_past: c_int_p) -> c_bool: - """ write the image represented by embed into the llama context with batch size n_batch, - starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed.""" - return _lib.llava_eval_image_embed(ctx, image_embed, n_batch, n_past) -_lib.llava_eval_image_embed.argtypes = [llama_context_p, llava_image_embed_p, c_int, c_int_p] -_lib.llava_eval_image_embed.restyle = c_bool - From 82007d0b301f5432b8abcce0021d7f26de58cd4c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 14:01:58 -0500 Subject: [PATCH 08/26] Update llava example --- examples/multimodal/llava.py | 60 ++++++++++++++++++++++++------------ llama_cpp/llava_cpp.py | 2 +- 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/examples/multimodal/llava.py b/examples/multimodal/llava.py index a209625c1..ac5c33a73 100644 --- a/examples/multimodal/llava.py +++ b/examples/multimodal/llava.py @@ -1,69 +1,90 @@ import ctypes -import json import argparse import os import array import sys -from llama_cpp import (Llama, clip_model_load, llava_image_embed_make_with_filename, llava_image_embed_make_with_bytes, - llava_image_embed_p, llava_image_embed_free, llava_validate_embed_size, llava_eval_image_embed) +from llama_cpp import Llama +from llama_cpp.llava_cpp import ( + clip_model_load, + llava_image_embed_make_with_filename, + llava_image_embed_make_with_bytes, + llava_image_embed_free, + llava_validate_embed_size, + llava_eval_image_embed, +) parser = argparse.ArgumentParser() -parser.add_argument("-m", "--model", type=str, default="../models/llava-v1.5-7b/ggml-model-q5_k.gguf") +parser.add_argument( + "-m", "--model", type=str, default="../models/llava-v1.5-7b/ggml-model-q5_k.gguf" +) parser.add_argument("--mmproj", type=str, default="llava-v1.5-7b/mmproj-model-f16.gguf") parser.add_argument("-t", "--temp", type=float, default=0.1) -parser.add_argument("-p", "--prompt", 
type=str, default="Describe this image in detail.") +parser.add_argument( + "-p", "--prompt", type=str, default="Describe this image in detail." +) args = parser.parse_args() print(f"loading clip model from {args.mmproj}") if not os.path.exists(args.mmproj): raise FileNotFoundError(args.mmproj) -ctx_clip = clip_model_load(args.mmproj.encode('utf-8')) +ctx_clip = clip_model_load(fname=args.mmproj.encode("utf-8"), verbosity=0) image_path = os.path.join(os.path.dirname(__file__), "overfitting_lc.png") if not os.path.exists(image_path): raise FileNotFoundError(image_path) -image_embed = llava_image_embed_make_with_filename(ctx_clip=ctx_clip, n_threads=1, filename=image_path.encode('utf8')) +image_embed = llava_image_embed_make_with_filename( + ctx_clip=ctx_clip, n_threads=1, image_path=image_path.encode("utf8") +) -def load_image_embed_from_file_bytes(image_path: str) -> llava_image_embed_p: - with open(image_path, 'rb') as file: + +def load_image_embed_from_file_bytes(image_path: str): + with open(image_path, "rb") as file: image_bytes = file.read() bytes_length = len(image_bytes) - data_array = array.array('B', image_bytes) + data_array = array.array("B", image_bytes) c_ubyte_ptr = (ctypes.c_ubyte * len(data_array)).from_buffer(data_array) - return llava_image_embed_make_with_bytes(ctx_clip=ctx_clip, n_threads=1, image_bytes=c_ubyte_ptr, image_bytes_length=bytes_length) + return llava_image_embed_make_with_bytes( + ctx_clip=ctx_clip, + n_threads=1, + image_bytes=c_ubyte_ptr, + image_bytes_length=bytes_length, + ) + print(f"loading llm model from {args.model}") if not os.path.exists(args.model): raise FileNotFoundError(args.model) -llm = Llama(model_path=args.model, n_ctx=2048, n_gpu_layers=1) # longer context needed for image embeds +llm = Llama( + model_path=args.model, n_ctx=2048, n_gpu_layers=1 +) # longer context needed for image embeds if not llava_validate_embed_size(llm.ctx, ctx_clip): raise RuntimeError("llm and mmproj model embed size mismatch") # eval system prompt system_prompt = "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n" -llm.eval(llm.tokenize(system_prompt.encode('utf8'))) -llm.eval(llm.tokenize("\nUSER: ".encode('utf8'))) +llm.eval(llm.tokenize(system_prompt.encode("utf8"))) +llm.eval(llm.tokenize("\nUSER: ".encode("utf8"))) # eval image embed n_past = ctypes.c_int(llm.n_tokens) -n_past_p = ctypes.byref(n_past) +n_past_p = ctypes.pointer(n_past) llava_eval_image_embed(llm.ctx, image_embed, llm.n_batch, n_past_p) llm.n_tokens = n_past.value llava_image_embed_free(image_embed) # eval prompt -prompt = 'Describe the visual content of this image' -llm.eval(llm.tokenize(prompt.encode('utf8'))) -llm.eval(llm.tokenize("\nASSISTANT:".encode('utf8'))) +prompt = "Describe the visual content of this image" +llm.eval(llm.tokenize(prompt.encode("utf8"))) +llm.eval(llm.tokenize("\nASSISTANT:".encode("utf8"))) # get output print("\n") max_target_len = 256 for i in range(max_target_len): t_id = llm.sample(temp=0.1) - t = llm.detokenize([t_id]).decode('utf8') + t = llm.detokenize([t_id]).decode("utf8") if t == "": break print(t, end="") @@ -72,4 +93,3 @@ def load_image_embed_from_file_bytes(image_path: str) -> llava_image_embed_p: print("\n") print("done") - diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index 5dc4b4201..72f6a1211 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -134,7 +134,7 @@ def llava_image_embed_free(embed: "_Pointer[llava_image_embed]"): # /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */ # LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); -def llava_eval_image_embed(ctx_llama: llama_cpp.llama_context_p, embed: "_Pointer[llava_image_embed]", n_batch: Union[c_int, int], n_past: Union[c_int, int]) -> bool: +def llava_eval_image_embed(ctx_llama: llama_cpp.llama_context_p, embed: "_Pointer[llava_image_embed]", n_batch: Union[c_int, int], n_past: "_Pointer[c_int]") -> bool: return _libllava.llava_eval_image_embed(ctx_llama, embed, n_batch, n_past) _libllava.llava_eval_image_embed.argtypes = [llama_cpp.llama_context_p, POINTER(llava_image_embed), c_int, POINTER(c_int)] From f6fe6b001cb96beb149ba74e0d8e94422313f374 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 14:46:45 -0500 Subject: [PATCH 09/26] Add types for new gpt-4-vision-preview api --- llama_cpp/llama_types.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index a64033ea0..5d48cede3 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -77,9 +77,25 @@ class ChatCompletionFunctionCall(TypedDict): arguments: str +class _ChatCompletionTextContent(TypedDict): + type: Literal["text"] + text: str + + +class _ChatCompletionImageUrlContentUrl(TypedDict): + url: str + + +class _ChatCompletionImageUrlContent(TypedDict): + type: Literal["image_url"] + image_url: _ChatCompletionImageUrlContentUrl + + class ChatCompletionResponseMessage(TypedDict): role: Literal["assistant", "user", "system", "function"] - content: Optional[str] + content: Optional[ + Union[str, _ChatCompletionTextContent, _ChatCompletionImageUrlContent] + ] user: NotRequired[str] function_call: NotRequired[ChatCompletionFunctionCall] From 39e2be13c3086cc202816bd8e093568991d004cc Mon Sep 17 00:00:00 
2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 14:47:54 -0500 Subject: [PATCH 10/26] Fix typo --- llama_cpp/llama_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 5d48cede3..9d2848552 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -1,4 +1,4 @@ -"""Types and request signatrues for OpenAI compatibility +"""Types and request signatures for OpenAI compatibility Based on the OpenAI OpenAPI specification: https://github.com/openai/openai-openapi/blob/master/openapi.yaml From 7c3009ed5fe723cfc1d7cbc79f604728342d262a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 19:05:55 -0500 Subject: [PATCH 11/26] Update llama.cpp --- .gitmodules | 2 +- vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 6fe937b38..7edf0975d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp - url = https://github.com/damian0815/llama.cpp.git + url = https://github.com/ggerganov/llama.cpp.git diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 22f43fca0..381efbf48 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 22f43fca0ac2237766f825a8ab4aa2d5e19238d0 +Subproject commit 381efbf480959bb6d1e247a8b0c2328f22e350f8 From 1f1abfdea8d4061a55da84fe667d59e510a8b45f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 00:09:25 -0500 Subject: [PATCH 12/26] Update llama_types to match OpenAI v1 API --- llama_cpp/llama.py | 10 +-- llama_cpp/llama_chat_format.py | 46 +++++++++-- llama_cpp/llama_types.py | 138 +++++++++++++++++++++++---------- llama_cpp/server/app.py | 4 +- 4 files changed, 145 insertions(+), 53 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6dc113ac9..b4242ea04 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1231,7 +1231,7 @@ def create_embedding( else: inputs = input - data: List[EmbeddingData] = [] + data: List[Embedding] = [] total_tokens = 0 for index, input in enumerate(inputs): tokens = self.tokenize(input.encode("utf-8"), special=True) @@ -1297,7 +1297,7 @@ def _create_completion( stopping_criteria: Optional[StoppingCriteriaList] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, - ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: + ) -> Union[Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]]: assert self._ctx is not None assert suffix is None or suffix.__class__ is str @@ -1753,7 +1753,7 @@ def create_completion( stopping_criteria: Optional[StoppingCriteriaList] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, - ) -> Union[Completion, Iterator[CompletionChunk]]: + ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. 
Args: @@ -1800,7 +1800,7 @@ def create_completion( grammar=grammar, ) if stream: - chunks: Iterator[CompletionChunk] = completion_or_chunks + chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks return chunks completion: Completion = next(completion_or_chunks) # type: ignore return completion @@ -1828,7 +1828,7 @@ def __call__( stopping_criteria: Optional[StoppingCriteriaList] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, - ) -> Union[Completion, Iterator[CompletionChunk]]: + ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. Args: diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 903a8c908..30c505a53 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -199,7 +199,7 @@ def _convert_text_completion_to_chat( def _convert_text_completion_chunks_to_chat( - chunks: Iterator[llama_types.CompletionChunk], + chunks: Iterator[llama_types.CreateCompletionStreamResponse], ) -> Iterator[llama_types.ChatCompletionChunk]: for i, chunk in enumerate(chunks): if i == 0: @@ -239,12 +239,12 @@ def _convert_text_completion_chunks_to_chat( def _convert_completion_to_chat( completion_or_chunks: Union[ - llama_types.Completion, Iterator[llama_types.CompletionChunk] + llama_types.CreateCompletionResponse, Iterator[llama_types.CreateCompletionStreamResponse] ], stream: bool = False, -) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: +) -> Union[llama_types.CreateChatCompletionResponse, Iterator[llama_types.ChatCompletionChunk]]: if stream: - chunks: Iterator[llama_types.CompletionChunk] = completion_or_chunks # type: ignore + chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore return _convert_text_completion_chunks_to_chat(chunks) else: completion: llama_types.Completion = completion_or_chunks # type: ignore @@ -613,13 +613,13 @@ def prepare_messages_for_inference( all_messages: List[llama_types.ChatCompletionRequestMessage] = [] if functions is not None: all_messages.append( - llama_types.ChatCompletionRequestMessage( + llama_types.ChatCompletionRequestSystemMessage( role="system", content=generate_schema_from_functions(functions) ) ) all_messages.append( - llama_types.ChatCompletionRequestMessage( + llama_types.ChatCompletionRequestSystemMessage( role="system", content=SYSTEM_MESSAGE ) ) @@ -636,7 +636,7 @@ def prepare_messages_for_inference( all_messages.append(message) all_messages.append( - llama_types.ChatCompletionRequestMessage(role="assistant", content=None) + llama_types.ChatCompletionRequestAssistantMessage(role="assistant", content=None) ) def message_to_str(msg: llama_types.ChatCompletionRequestMessage): @@ -734,3 +734,35 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): ], usage=completion["usage"], ) + + +@register_chat_completion_handler("llava-1.5") +def lava_1_5_chat_handler( + llama: llama.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[Union[str, llama_types.ChatCompletionFunctionCall]] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + max_tokens: int = 256, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + 
mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[llama.LogitsProcessorList] = None, + grammar: Optional[llama.LlamaGrammar] = None, +) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: + # convert messages into a list of strings and images objects + # for each item in list + # if string, process it and append to prompt + # if image, evaluate it and add empty string to prompt (for now) + # generate completion + items = [] + current_prompt = "" \ No newline at end of file diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 9d2848552..43c4f081e 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -1,5 +1,7 @@ """Types and request signatures for OpenAI compatibility +NOTE: These types may change to match the OpenAI OpenAPI specification. + Based on the OpenAI OpenAPI specification: https://github.com/openai/openai-openapi/blob/master/openapi.yaml @@ -19,9 +21,6 @@ class Embedding(TypedDict): embedding: List[float] -EmbeddingData = Embedding - - class CreateEmbeddingResponse(TypedDict): object: Literal["list"] model: str @@ -57,9 +56,6 @@ class CreateCompletionStreamResponse(TypedDict): choices: List[CompletionChoice] -CompletionChunk = CreateCompletionStreamResponse - - class CreateCompletionResponse(TypedDict): id: str object: Literal["text_completion"] @@ -69,9 +65,6 @@ class CreateCompletionResponse(TypedDict): usage: CompletionUsage -Completion = CreateCompletionResponse - - class ChatCompletionFunctionCall(TypedDict): name: str arguments: str @@ -100,73 +93,58 @@ class ChatCompletionResponseMessage(TypedDict): function_call: NotRequired[ChatCompletionFunctionCall] -ChatCompletionMessage = ChatCompletionResponseMessage - - class ChatCompletionResponseFunction(TypedDict): name: str description: NotRequired[str] parameters: Dict[str, Any] # TODO: make this more specific -ChatCompletionFunction = ChatCompletionResponseFunction - - class ChatCompletionResponseChoice(TypedDict): index: int - message: ChatCompletionMessage + message: "ChatCompletionMessage" finish_reason: Optional[str] -ChatCompletionChoice = ChatCompletionResponseChoice - - class CreateChatCompletionResponse(TypedDict): id: str object: Literal["chat.completion"] created: int model: str - choices: List[ChatCompletionChoice] + choices: List["ChatCompletionChoice"] usage: CompletionUsage -ChatCompletion = CreateChatCompletionResponse +class ChatCompletionMessageToolCallChunk(TypedDict): + index: int + id: NotRequired[str] + type: Literal["function"] + function: ChatCompletionFunctionCall class ChatCompletionStreamResponseDeltaEmpty(TypedDict): pass -ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty - - class ChatCompletionStreamResponseDelta(TypedDict): - role: NotRequired[Literal["assistant"]] content: NotRequired[str] function_call: NotRequired[ChatCompletionFunctionCall] - - -ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta + tool_calls: NotRequired[List[ChatCompletionMessageToolCallChunk]] + role: NotRequired[Literal["system", "user", "assistant", "tool"]] class ChatCompletionStreamResponseChoice(TypedDict): index: int - delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty] + delta: Union["ChatCompletionChunkDelta", "ChatCompletionChunkDeltaEmpty"] finish_reason: Optional[Literal["stop", "length", "function_call"]] -ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice - - class 
ChatCompletionStreamResponse(TypedDict): id: str model: str object: Literal["chat.completion.chunk"] created: int - choices: List[ChatCompletionChunkChoice] - + choices: List["ChatCompletionChunkChoice"] -ChatCompletionChunk = ChatCompletionStreamResponse JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]] @@ -181,8 +159,90 @@ class ChatCompletionFunctionCallOption(TypedDict): name: str -class ChatCompletionRequestMessage(TypedDict): - role: Literal["assistant", "user", "system", "function"] +class ChatCompletionRequestMessageContentPartText(TypedDict): + type: Literal["text"] + text: str + + +class ChatCompletionRequestMessageContentPartImageImageUrl(TypedDict): + url: str + detail: NotRequired[Literal["auto", "low", "high"]] + + +class ChatCompletionRequestMessageContentPartImage(TypedDict): + type: Literal["image_url"] + image_url: ChatCompletionRequestMessageContentPartImageImageUrl + + +ChatCompletionRequestMessageContentPart = Union[ + ChatCompletionRequestMessageContentPartText, + ChatCompletionRequestMessageContentPartImage, +] + + +class ChatCompletionRequestSystemMessage(TypedDict): + role: Literal["system"] content: Optional[str] - name: NotRequired[str] - function_call: NotRequired[ChatCompletionFunctionCall] + + +class ChatCompletionRequestUserMessage(TypedDict): + role: Literal["user"] + content: Optional[Union[str, List[ChatCompletionRequestMessageContentPart]]] + + +class ChatCompletionMessageToolCallFunction(TypedDict): + name: str + arguments: str + + +class ChatCompletionMessageToolCall(TypedDict): + id: str + type: Literal["function"] + function: ChatCompletionMessageToolCallFunction + + +ChatCompletionMessageToolCalls = List[ChatCompletionMessageToolCall] + + +class ChatCompletionRequestAssistantMessage(TypedDict): + role: Literal["assistant"] + content: Optional[str] + tool_calls: NotRequired[ChatCompletionMessageToolCalls] + function_call: NotRequired[ChatCompletionFunctionCall] # DEPRECATED + + +class ChatCompletionRequestToolMessage(TypedDict): + role: Literal["tool"] + content: Optional[str] + tool_call_id: str + + +class ChatCompletionRequestFunctionMessage(TypedDict): + role: Literal["function"] + content: Optional[str] + name: str + + +ChatCompletionRequestMessage = Union[ + ChatCompletionRequestSystemMessage, + ChatCompletionRequestUserMessage, + ChatCompletionRequestAssistantMessage, + ChatCompletionRequestUserMessage, + ChatCompletionRequestToolMessage, + ChatCompletionRequestFunctionMessage, +] + +# NOTE: The following type names are not part of the OpenAI OpenAPI specification +# and will be removed in a future major release. 
+ +EmbeddingData = Embedding +CompletionChunk = CreateCompletionStreamResponse +Completion = CreateCompletionResponse +ChatCompletionMessage = ChatCompletionResponseMessage +ChatCompletionChoice = ChatCompletionResponseChoice +ChatCompletion = CreateChatCompletionResponse +ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty +ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice +ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta +ChatCompletionChunk = ChatCompletionStreamResponse +ChatCompletionFunction = ChatCompletionResponseFunction diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 93afc3ee9..afd6a055b 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -688,7 +688,7 @@ async def create_completion( kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) iterator_or_completion: Union[ - llama_cpp.Completion, Iterator[llama_cpp.CompletionChunk] + llama_cpp.CreateCompletionResponse, Iterator[llama_cpp.CreateCompletionStreamResponse] ] = await run_in_threadpool(llama, **kwargs) if isinstance(iterator_or_completion, Iterator): @@ -697,7 +697,7 @@ async def create_completion( # If no exception was raised from first_response, we can assume that # the iterator is valid and we can use it to stream the response. - def iterator() -> Iterator[llama_cpp.CompletionChunk]: + def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: yield first_response yield from iterator_or_completion From 2a369f411fd083396a6917d21f7bfd19d56901b0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 00:59:29 -0500 Subject: [PATCH 13/26] Update ChatCompletionFunction type --- llama_cpp/llama_types.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 43c4f081e..bff77a180 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -93,7 +93,7 @@ class ChatCompletionResponseMessage(TypedDict): function_call: NotRequired[ChatCompletionFunctionCall] -class ChatCompletionResponseFunction(TypedDict): +class ChatCompletionFunction(TypedDict): name: str description: NotRequired[str] parameters: Dict[str, Any] # TODO: make this more specific @@ -245,4 +245,4 @@ class ChatCompletionRequestFunctionMessage(TypedDict): ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta ChatCompletionChunk = ChatCompletionStreamResponse -ChatCompletionFunction = ChatCompletionResponseFunction +ChatCompletionResponseFunction = ChatCompletionFunction From 2ea2adfa5f2d545e872b622f463f152f67038de5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 00:59:42 -0500 Subject: [PATCH 14/26] Reorder request parameters --- llama_cpp/server/app.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index afd6a055b..dd6169931 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -580,10 +580,6 @@ class CreateCompletionRequest(BaseModel): max_tokens: int = max_tokens_field temperature: float = temperature_field top_p: float = top_p_field - mirostat_mode: int = mirostat_mode_field - mirostat_tau: float = mirostat_tau_field - mirostat_eta: float = mirostat_eta_field - grammar: Optional[str] = None echo: bool = Field( default=False, description="Whether to echo the prompt in the generated text. 
Useful for chatbots.", @@ -610,6 +606,10 @@ class CreateCompletionRequest(BaseModel): top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + mirostat_mode: int = mirostat_mode_field + mirostat_tau: float = mirostat_tau_field + mirostat_eta: float = mirostat_eta_field + grammar: Optional[str] = None model_config = { "json_schema_extra": { @@ -765,10 +765,6 @@ class CreateChatCompletionRequest(BaseModel): max_tokens: int = max_tokens_field temperature: float = temperature_field top_p: float = top_p_field - mirostat_mode: int = mirostat_mode_field - mirostat_tau: float = mirostat_tau_field - mirostat_eta: float = mirostat_eta_field - grammar: Optional[str] = None stop: Optional[List[str]] = stop_field stream: bool = stream_field presence_penalty: Optional[float] = presence_penalty_field @@ -784,6 +780,10 @@ class CreateChatCompletionRequest(BaseModel): top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + mirostat_mode: int = mirostat_mode_field + mirostat_tau: float = mirostat_tau_field + mirostat_eta: float = mirostat_eta_field + grammar: Optional[str] = None model_config = { "json_schema_extra": { From 87fc84bb965f59aea0bb76a56c93b3a5272e423d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 08:17:14 -0500 Subject: [PATCH 15/26] More API type fixes --- llama_cpp/llama.py | 6 +- llama_cpp/llama_chat_format.py | 34 ++++++++--- llama_cpp/llama_types.py | 108 ++++++++++++++++++++++----------- llama_cpp/server/app.py | 13 +++- 4 files changed, 110 insertions(+), 51 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index b4242ea04..f09b8d58a 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1879,7 +1879,9 @@ def create_chat_completion( self, messages: List[ChatCompletionRequestMessage], functions: Optional[List[ChatCompletionFunction]] = None, - function_call: Optional[Union[str, ChatCompletionFunctionCall]] = None, + function_call: Optional[ChatCompletionRequestFunctionCall] = None, + tools: List[ChatCompletionTool] = [], + tool_choice: Optional[ChatCompletionToolChoiceOption] = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, @@ -1918,6 +1920,8 @@ def create_chat_completion( messages=messages, functions=functions, function_call=function_call, + tools=tools, + tool_choice=tool_choice, temperature=temperature, top_p=top_p, top_k=top_k, diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 30c505a53..103aa5ffd 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -11,12 +11,13 @@ class LlamaChatCompletionHandler(Protocol): def __call__( self, + *, llama: llama.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[ - Union[str, llama_types.ChatCompletionFunctionCall] - ] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, @@ -33,6 +34,7 @@ def __call__( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, + **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, 
Iterator[llama_types.ChatCompletionChunk]]: ... @@ -239,10 +241,13 @@ def _convert_text_completion_chunks_to_chat( def _convert_completion_to_chat( completion_or_chunks: Union[ - llama_types.CreateCompletionResponse, Iterator[llama_types.CreateCompletionStreamResponse] + llama_types.CreateCompletionResponse, + Iterator[llama_types.CreateCompletionStreamResponse], ], stream: bool = False, -) -> Union[llama_types.CreateChatCompletionResponse, Iterator[llama_types.ChatCompletionChunk]]: +) -> Union[ + llama_types.CreateChatCompletionResponse, Iterator[llama_types.ChatCompletionChunk] +]: if stream: chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore return _convert_text_completion_chunks_to_chat(chunks) @@ -329,7 +334,9 @@ def get_chat_format(name: str): ) -def hf_autotokenizer_to_chat_formatter(pretrained_model_name_or_path: Union[str, os.PathLike[str]]) -> ChatFormatter: +def hf_autotokenizer_to_chat_formatter( + pretrained_model_name_or_path: Union[str, os.PathLike[str]] +) -> ChatFormatter: # https://huggingface.co/docs/transformers/main/chat_templating # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/blob/main/tokenizer_config.json @@ -538,7 +545,7 @@ def functionary_chat_handler( llama: llama.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[Union[str, llama_types.ChatCompletionFunctionCall]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, @@ -555,6 +562,7 @@ def functionary_chat_handler( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, + **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" @@ -636,7 +644,9 @@ def prepare_messages_for_inference( all_messages.append(message) all_messages.append( - llama_types.ChatCompletionRequestAssistantMessage(role="assistant", content=None) + llama_types.ChatCompletionRequestAssistantMessage( + role="assistant", content=None + ) ) def message_to_str(msg: llama_types.ChatCompletionRequestMessage): @@ -713,6 +723,9 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): prompt=new_prompt, stop=["user:", ""], stream=False ) # type: ignore + assert "usage" in completion + assert isinstance(function_call, str) + return llama_types.CreateChatCompletionResponse( id="chat" + completion["id"], object="chat.completion", @@ -741,7 +754,7 @@ def lava_1_5_chat_handler( llama: llama.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[Union[str, llama_types.ChatCompletionFunctionCall]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, @@ -758,6 +771,7 @@ def lava_1_5_chat_handler( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, + **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: # convert messages into a list of strings and images objects # for each item in list @@ -765,4 +779,4 @@ def lava_1_5_chat_handler( # if image, evaluate it and add empty string to prompt (for now) # generate completion items = [] - current_prompt = "" \ No newline at end of file + current_prompt = "" diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index bff77a180..cc4c1518f 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -48,49 +48,25 @@ class CompletionUsage(TypedDict): total_tokens: int -class CreateCompletionStreamResponse(TypedDict): - id: str - object: Literal["text_completion"] - created: int - model: str - choices: List[CompletionChoice] - - class CreateCompletionResponse(TypedDict): id: str object: Literal["text_completion"] created: int model: str choices: List[CompletionChoice] - usage: CompletionUsage + usage: NotRequired[CompletionUsage] -class ChatCompletionFunctionCall(TypedDict): +class ChatCompletionResponseFunctionCall(TypedDict): name: str arguments: str -class _ChatCompletionTextContent(TypedDict): - type: Literal["text"] - text: str - - -class _ChatCompletionImageUrlContentUrl(TypedDict): - url: str - - -class _ChatCompletionImageUrlContent(TypedDict): - type: Literal["image_url"] - image_url: _ChatCompletionImageUrlContentUrl - - class ChatCompletionResponseMessage(TypedDict): - role: Literal["assistant", "user", "system", "function"] - content: Optional[ - Union[str, _ChatCompletionTextContent, _ChatCompletionImageUrlContent] - ] - user: NotRequired[str] - function_call: NotRequired[ChatCompletionFunctionCall] + content: Optional[str] + tool_calls: NotRequired["ChatCompletionMessageToolCalls"] + role: Literal["assistant", "function"] # NOTE: "function" may be incorrect here + function_call: NotRequired[ChatCompletionResponseFunctionCall] # DEPRECATED class ChatCompletionFunction(TypedDict): @@ -101,7 +77,7 @@ class ChatCompletionFunction(TypedDict): class ChatCompletionResponseChoice(TypedDict): index: int - message: "ChatCompletionMessage" + message: "ChatCompletionResponseMessage" 
finish_reason: Optional[str] @@ -110,24 +86,36 @@ class CreateChatCompletionResponse(TypedDict): object: Literal["chat.completion"] created: int model: str - choices: List["ChatCompletionChoice"] + choices: List["ChatCompletionResponseChoice"] usage: CompletionUsage +class ChatCompletionMessageToolCallChunkFunction(TypedDict): + name: str + arguments: str + + class ChatCompletionMessageToolCallChunk(TypedDict): index: int id: NotRequired[str] type: Literal["function"] - function: ChatCompletionFunctionCall + function: ChatCompletionMessageToolCallChunkFunction class ChatCompletionStreamResponseDeltaEmpty(TypedDict): pass +class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict): + name: str + arguments: str + + class ChatCompletionStreamResponseDelta(TypedDict): content: NotRequired[str] - function_call: NotRequired[ChatCompletionFunctionCall] + function_call: NotRequired[ + ChatCompletionStreamResponseDeltaFunctionCall + ] # DEPRECATED tool_calls: NotRequired[List[ChatCompletionMessageToolCallChunk]] role: NotRequired[Literal["system", "user", "assistant", "tool"]] @@ -171,7 +159,7 @@ class ChatCompletionRequestMessageContentPartImageImageUrl(TypedDict): class ChatCompletionRequestMessageContentPartImage(TypedDict): type: Literal["image_url"] - image_url: ChatCompletionRequestMessageContentPartImageImageUrl + image_url: Union[str, ChatCompletionRequestMessageContentPartImageImageUrl] ChatCompletionRequestMessageContentPart = Union[ @@ -204,11 +192,18 @@ class ChatCompletionMessageToolCall(TypedDict): ChatCompletionMessageToolCalls = List[ChatCompletionMessageToolCall] +class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict): + name: str + arguments: str + + class ChatCompletionRequestAssistantMessage(TypedDict): role: Literal["assistant"] content: Optional[str] tool_calls: NotRequired[ChatCompletionMessageToolCalls] - function_call: NotRequired[ChatCompletionFunctionCall] # DEPRECATED + function_call: NotRequired[ + ChatCompletionRequestAssistantMessageFunctionCall + ] # DEPRECATED class ChatCompletionRequestToolMessage(TypedDict): @@ -232,12 +227,50 @@ class ChatCompletionRequestFunctionMessage(TypedDict): ChatCompletionRequestFunctionMessage, ] + +class ChatCompletionRequestFunctionCallOption: + name: str + + +ChatCompletionRequestFunctionCall = Union[ + Literal["none", "auto"], ChatCompletionRequestFunctionCallOption +] + +ChatCompletionFunctionParameters = Dict[str, JsonType] + + +class ChatCompletionToolFunction(TypedDict): + name: str + description: NotRequired[str] + parameters: ChatCompletionFunctionParameters + + +class ChatCompletionTool(TypedDict): + type: Literal["function"] + function: ChatCompletionToolFunction + + +class ChatCompletionNamedToolChoiceFunction(TypedDict): + name: str + + +class ChatCompletionNamedToolChoice(TypedDict): + type: Literal["function"] + function: ChatCompletionNamedToolChoiceFunction + + +ChatCompletionToolChoiceOption = Union[ + Literal["none", "auto"], ChatCompletionNamedToolChoice +] + + # NOTE: The following type names are not part of the OpenAI OpenAPI specification # and will be removed in a future major release. 
EmbeddingData = Embedding -CompletionChunk = CreateCompletionStreamResponse +CompletionChunk = CreateCompletionResponse Completion = CreateCompletionResponse +CreateCompletionStreamResponse = CreateCompletionResponse ChatCompletionMessage = ChatCompletionResponseMessage ChatCompletionChoice = ChatCompletionResponseChoice ChatCompletion = CreateChatCompletionResponse @@ -246,3 +279,4 @@ class ChatCompletionRequestFunctionMessage(TypedDict): ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta ChatCompletionChunk = ChatCompletionStreamResponse ChatCompletionResponseFunction = ChatCompletionFunction +ChatCompletionFunctionCall = ChatCompletionResponseFunctionCall diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index dd6169931..261c58445 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -748,20 +748,27 @@ class ChatCompletionRequestMessage(BaseModel): ) content: Optional[str] = Field(default="", description="The content of the message.") -from typing import Any class CreateChatCompletionRequest(BaseModel): - messages: List[Any] = Field( + messages: List[llama_cpp.ChatCompletionRequestMessage] = Field( default=[], description="A list of messages to generate completions for." ) functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field( default=None, description="A list of functions to apply to the generated completions.", ) - function_call: Optional[Union[Literal["auto", "none"], llama_cpp.ChatCompletionFunctionCallOption]] = Field( + function_call: Optional[llama_cpp.ChatCompletionRequestFunctionCall] = Field( default=None, description="A function to apply to the generated completions.", ) + tools: Optional[List[llama_cpp.ChatCompletionTool]] = Field( + default=None, + description="A list of tools to apply to the generated completions.", + ) + tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field( + default=None, + description="A tool to apply to the generated completions.", + ) # TODO: verify max_tokens: int = max_tokens_field temperature: float = temperature_field top_p: float = top_p_field From 5091b9c5645547f69e652c9091556e6ec5b0d064 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 10:14:45 -0500 Subject: [PATCH 16/26] Even More Type Updates --- llama_cpp/llama.py | 12 ++++-- llama_cpp/llama_chat_format.py | 75 +++++++++++++++++++--------------- llama_cpp/llama_types.py | 13 +++--- 3 files changed, 59 insertions(+), 41 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index f09b8d58a..ebb90046e 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1297,7 +1297,9 @@ def _create_completion( stopping_criteria: Optional[StoppingCriteriaList] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, - ) -> Union[Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]]: + ) -> Union[ + Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] + ]: assert self._ctx is not None assert suffix is None or suffix.__class__ is str @@ -1880,7 +1882,7 @@ def create_chat_completion( messages: List[ChatCompletionRequestMessage], functions: Optional[List[ChatCompletionFunction]] = None, function_call: Optional[ChatCompletionRequestFunctionCall] = None, - tools: List[ChatCompletionTool] = [], + tools: Optional[List[ChatCompletionTool]] = None, tool_choice: Optional[ChatCompletionToolChoiceOption] = None, temperature: float = 0.2, top_p: float = 0.95, @@ -1898,7 +1900,9 @@ def create_chat_completion( model: 
Optional[str] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, - ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + ) -> Union[ + CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] + ]: """Generate a chat completion from a list of messages. Args: @@ -1916,7 +1920,7 @@ def create_chat_completion( """ handler = llama_chat_format.get_chat_completion_handler(self.chat_format) return handler( - self, + llama=self, messages=messages, functions=functions, function_call=function_call, diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 103aa5ffd..a10bc70cc 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -35,7 +35,7 @@ def __call__( logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, **kwargs, # type: ignore - ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: + ) -> Union[llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse]]: ... @@ -749,34 +749,45 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): ) -@register_chat_completion_handler("llava-1.5") -def lava_1_5_chat_handler( - llama: llama.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - temperature: float = 0.2, - top_p: float = 0.95, - top_k: int = 40, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - max_tokens: int = 256, - presence_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - logits_processor: Optional[llama.LogitsProcessorList] = None, - grammar: Optional[llama.LlamaGrammar] = None, - **kwargs, # type: ignore -) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: - # convert messages into a list of strings and images objects - # for each item in list - # if string, process it and append to prompt - # if image, evaluate it and add empty string to prompt (for now) - # generate completion - items = [] - current_prompt = "" +class Llava15ChatHandler: + def __init__(self, clip_model_path: str): + self.clip_model_path = clip_model_path + + def chat_handler( + self, + llama: llama.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + max_tokens: int = 256, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[llama.LogitsProcessorList] = None, + grammar: Optional[llama.LlamaGrammar] = None, + **kwargs, # type: ignore + ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: + # convert messages into a list of strings and images objects + # for each item in list + # if string, process it and append to prompt + # if 
image, evaluate it and add empty string to prompt (for now) + # generate completion + items = [] + current_prompt = "" + system_prompt = "" + for message in messages: + if message["role"] == "system" and message["content"] is not None: + system_prompt = message["content"] + if message["role"] == "user": + items.append(message["content"]) + current_prompt += message["content"] \ No newline at end of file diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index cc4c1518f..b49d8594c 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -65,7 +65,7 @@ class ChatCompletionResponseFunctionCall(TypedDict): class ChatCompletionResponseMessage(TypedDict): content: Optional[str] tool_calls: NotRequired["ChatCompletionMessageToolCalls"] - role: Literal["assistant", "function"] # NOTE: "function" may be incorrect here + role: Literal["assistant", "function"] # NOTE: "function" may be incorrect here function_call: NotRequired[ChatCompletionResponseFunctionCall] # DEPRECATED @@ -122,16 +122,18 @@ class ChatCompletionStreamResponseDelta(TypedDict): class ChatCompletionStreamResponseChoice(TypedDict): index: int - delta: Union["ChatCompletionChunkDelta", "ChatCompletionChunkDeltaEmpty"] + delta: Union[ + ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty + ] finish_reason: Optional[Literal["stop", "length", "function_call"]] -class ChatCompletionStreamResponse(TypedDict): +class CreateChatCompletionStreamResponse(TypedDict): id: str model: str object: Literal["chat.completion.chunk"] created: int - choices: List["ChatCompletionChunkChoice"] + choices: List[ChatCompletionStreamResponseChoice] JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]] @@ -277,6 +279,7 @@ class ChatCompletionNamedToolChoice(TypedDict): ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta -ChatCompletionChunk = ChatCompletionStreamResponse +ChatCompletionChunk = CreateChatCompletionStreamResponse +ChatCompletionStreamResponse = CreateChatCompletionStreamResponse ChatCompletionResponseFunction = ChatCompletionFunction ChatCompletionFunctionCall = ChatCompletionResponseFunctionCall From 22a776d5936f7e62c62a9637a128acaf90ef2a32 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 10:15:08 -0500 Subject: [PATCH 17/26] Add parameter for custom chat_handler to Llama class --- llama_cpp/llama.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ebb90046e..9cf50fc27 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -752,6 +752,7 @@ def __init__( numa: bool = False, # Chat Format Params chat_format: str = "llama-2", + chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None, # Misc verbose: bool = True, # Extra Params @@ -784,6 +785,7 @@ def __init__( lora_path: Path to a LoRA file to apply to the model. numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init) chat_format: String specifying the chat format to use when calling create_chat_completion. + chat_handler: Optional chat handler to use when calling create_chat_completion. verbose: Print verbose output to stderr. 
Raises: @@ -910,6 +912,7 @@ def __init__( print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) self.chat_format = chat_format + self.chat_handler = chat_handler self._n_vocab = self.n_vocab() self._n_ctx = self.n_ctx() @@ -1918,7 +1921,9 @@ def create_chat_completion( Returns: Generated chat completion or a stream of chat completion chunks. """ - handler = llama_chat_format.get_chat_completion_handler(self.chat_format) + handler = self.chat_handler or llama_chat_format.get_chat_completion_handler( + self.chat_format + ) return handler( llama=self, messages=messages, @@ -1982,6 +1987,7 @@ def __getstate__(self): numa=self.numa, # Chat Format Params chat_format=self.chat_format, + chat_handler=self.chat_handler, # Misc verbose=self.verbose, ) @@ -2023,6 +2029,7 @@ def __setstate__(self, state): numa=state["numa"], # Chat Format Params chat_format=state["chat_format"], + chat_handler=state["chat_handler"], # Misc verbose=state["verbose"], ) From 5ac81151665532471392b71eea959a23d17e2863 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 15:13:44 -0500 Subject: [PATCH 18/26] Fix circular import --- llama_cpp/llama_chat_format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a10bc70cc..c029312da 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4,8 +4,8 @@ import dataclasses from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, Protocol -from . import llama_types -from . import llama +import llama_cpp.llama_types as llama_types +import llama_cpp.llama as llama class LlamaChatCompletionHandler(Protocol): From cb749f2449e85120ba2c3603734a1517a2e20cc6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 15:28:35 -0500 Subject: [PATCH 19/26] Convert to absolute imports --- llama_cpp/llama.py | 2 +- llama_cpp/llama_grammar.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 9cf50fc27..fd219e28c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -21,9 +21,9 @@ import diskcache import ctypes -from . import llama_cpp from .llama_types import * from .llama_grammar import LlamaGrammar +import llama_cpp.llama_cpp as llama_cpp import llama_cpp.llama_chat_format as llama_chat_format import numpy as np diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index 29431d957..ccbea574b 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -19,7 +19,7 @@ overload, ) -from . 
import llama_cpp +import llama_cpp.llama_cpp as llama_cpp # Type aliases llama_grammar_element = llama_cpp.llama_grammar_element From d2d2a2d470a12ad278966ad63fcc2ddbe038d5c1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 15:29:03 -0500 Subject: [PATCH 20/26] Fix --- llama_cpp/llama_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index b49d8594c..3683dd844 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -230,7 +230,7 @@ class ChatCompletionRequestFunctionMessage(TypedDict): ] -class ChatCompletionRequestFunctionCallOption: +class ChatCompletionRequestFunctionCallOption(TypedDict): name: str From 177114c368fd9dcc867135974af189654026a135 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 22:12:43 -0500 Subject: [PATCH 21/26] Fix pydantic Jsontype bug --- llama_cpp/llama_types.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 3683dd844..69d07fc92 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -10,6 +10,12 @@ from typing_extensions import TypedDict, NotRequired, Literal +# NOTE: Defining this correctly using annotations seems to break pydantic validation. +# This is a workaround until we can figure out how to do this correctly +# JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]] +JsonType = Union[None, int, str, bool, List[Any], Dict[str, Any]] + + class EmbeddingUsage(TypedDict): prompt_tokens: int total_tokens: int @@ -72,7 +78,7 @@ class ChatCompletionResponseMessage(TypedDict): class ChatCompletionFunction(TypedDict): name: str description: NotRequired[str] - parameters: Dict[str, Any] # TODO: make this more specific + parameters: Dict[str, JsonType] # TODO: make this more specific class ChatCompletionResponseChoice(TypedDict): @@ -136,9 +142,6 @@ class CreateChatCompletionStreamResponse(TypedDict): choices: List[ChatCompletionStreamResponseChoice] -JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]] - - class ChatCompletionFunctions(TypedDict): name: str description: NotRequired[str] @@ -238,7 +241,7 @@ class ChatCompletionRequestFunctionCallOption(TypedDict): Literal["none", "auto"], ChatCompletionRequestFunctionCallOption ] -ChatCompletionFunctionParameters = Dict[str, JsonType] +ChatCompletionFunctionParameters = Dict[str, JsonType] # TODO: make this more specific class ChatCompletionToolFunction(TypedDict): From 21165e7d2b87092fc194a453d35f68001c5b8a5c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 22:13:04 -0500 Subject: [PATCH 22/26] Accept list of prompt tokens in create_completion --- llama_cpp/llama.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index fd219e28c..7a2c34f45 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1279,7 +1279,7 @@ def embed(self, input: str) -> List[float]: def _create_completion( self, - prompt: str, + prompt: Union[str, List[int]], suffix: Optional[str] = None, max_tokens: int = 16, temperature: float = 0.8, @@ -1314,7 +1314,7 @@ def _create_completion( self.tokenize(prompt.encode("utf-8"), special=True) if prompt != "" else [self.token_bos()] - ) + ) if isinstance(prompt, str) else prompt text: bytes = b"" returned_tokens: int = 0 stop = ( @@ -1327,7 +1327,7 @@ def _create_completion( if len(prompt_tokens) >= self._n_ctx: raise ValueError( - 
f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self._ctx)}" + f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" ) if max_tokens <= 0: @@ -1737,7 +1737,7 @@ def _create_completion( def create_completion( self, - prompt: str, + prompt: Union[str, List[int]], suffix: Optional[str] = None, max_tokens: int = 128, temperature: float = 0.8, From 74c414c7eba6e3812fb8256262e6d448d2ce501b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 22:13:29 -0500 Subject: [PATCH 23/26] Add llava1.5 chat handler --- llama_cpp/llama_chat_format.py | 104 ++++++++++++++++++++++++++++----- llama_cpp/server/app.py | 13 +++++ 2 files changed, 102 insertions(+), 15 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index c029312da..60b38d84d 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import ctypes import dataclasses from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, Protocol @@ -725,6 +726,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): assert "usage" in completion assert isinstance(function_call, str) + assert stream is False # TODO: support stream mode return llama_types.CreateChatCompletionResponse( id="chat" + completion["id"], @@ -751,14 +753,40 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): class Llava15ChatHandler: def __init__(self, clip_model_path: str): + import llama_cpp.llava_cpp as llava_cpp + + self._llava_cpp = llava_cpp self.clip_model_path = clip_model_path - def chat_handler( + self.clip_ctx = self._llava_cpp.clip_model_load(self.clip_model_path.encode(), 0) + + def __del__(self): + if self.clip_ctx is not None: + self._llava_cpp.clip_free(self.clip_ctx) + self.clip_ctx = None + + def load_image(self, image_url: str) -> bytes: + if image_url.startswith("data:"): + import base64 + + image_bytes = base64.b64decode(image_url.split(",")[1]) + return image_bytes + else: + import urllib.request + + with urllib.request.urlopen(image_url) as f: + image_bytes = f.read() + return image_bytes + + def __call__( self, + *, llama: llama.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, @@ -776,18 +804,64 @@ def chat_handler( logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, **kwargs, # type: ignore - ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: - # convert messages into a list of strings and images objects - # for each item in list - # if string, process it and append to prompt - # if image, evaluate it and add empty string to prompt (for now) - # generate completion - items = [] - current_prompt = "" - system_prompt = "" + ) -> Union[llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse]]: + assert llama.context_params.logits_all is True # BUG: logits_all=True is required for llava + assert self.clip_ctx is not None + system_prompt = _get_system_message(messages) + system_prompt = system_prompt if 
system_prompt != "" else "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." + system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." + user_role = "\nUSER:" + assistant_role = "\nASSISTANT:" + llama.reset() + llama.eval(llama.tokenize(system_prompt.encode("utf8"), add_bos=True)) for message in messages: - if message["role"] == "system" and message["content"] is not None: - system_prompt = message["content"] - if message["role"] == "user": - items.append(message["content"]) - current_prompt += message["content"] \ No newline at end of file + if message["role"] == "user" and message["content"] is not None: + if isinstance(message["content"], str): + llama.eval(llama.tokenize(f"{user_role} {message['content']}".encode("utf8"), add_bos=False)) + else: + assert isinstance(message["content"], list) + llama.eval(llama.tokenize(f"{user_role} ".encode("utf8"), add_bos=False)) + for content in message["content"]: + if content["type"] == "text": + llama.eval(llama.tokenize(f"{content['text']}".encode("utf8"), add_bos=False)) + if content["type"] == "image_url": + image_bytes = self.load_image(content["image_url"]["url"]) if isinstance(content["image_url"], dict) else self.load_image(content["image_url"]) + import array + data_array = array.array('B', image_bytes) + c_ubyte_ptr = (ctypes.c_ubyte * len(data_array)).from_buffer(data_array) + embed = self._llava_cpp.llava_image_embed_make_with_bytes(ctx_clip=self.clip_ctx, n_threads=llama.context_params.n_threads, image_bytes=c_ubyte_ptr, image_bytes_length=len(image_bytes)) + # image_bytes_p = (ctypes.c_uint8 * len(image_bytes)).from_buffer_copy(image_bytes) + # embed = self._llava_cpp.llava_image_embed_make_with_bytes(ctx_clip=self.clip_ctx, n_threads=1, image_bytes=image_bytes_p, image_bytes_length=len(image_bytes)) + try: + n_past = ctypes.c_int(llama.n_tokens) + n_past_p = ctypes.pointer(n_past) + self._llava_cpp.llava_eval_image_embed(ctx_llama=llama.ctx, embed=embed, n_batch=llama.n_batch, n_past=n_past_p) + assert llama.n_ctx() >= n_past.value + llama.n_tokens = n_past.value + finally: + self._llava_cpp.llava_image_embed_free(embed) + if message["role"] == "assistant" and message["content"] is not None: + llama.eval(llama.tokenize(f"ASSISTANT: {message['content']}".encode("utf8"), add_bos=False)) + llama.eval(llama.tokenize(f"{assistant_role}".encode("utf8"), add_bos=False)) + + prompt = llama._input_ids.tolist() + + return _convert_completion_to_chat(llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + ), stream=stream) \ No newline at end of file diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 261c58445..8ebc427ec 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -138,6 +138,10 @@ class Settings(BaseSettings): default="llama-2", description="Chat format to use.", ) + clip_model_path: Optional[str] = Field( + default=None, + description="Path to a CLIP model to use for multi-modal chat completion.", + 
) # Cache Params cache: bool = Field( default=False, @@ -375,6 +379,14 @@ def create_app(settings: Optional[Settings] = None): ) app.include_router(router) global llama + + ## + chat_handler = None + if settings.chat_format == "llava-1-5": + assert settings.clip_model_path is not None + chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(clip_model_path=settings.clip_model_path) + ## + llama = llama_cpp.Llama( model_path=settings.model, # Model Params @@ -411,6 +423,7 @@ def create_app(settings: Optional[Settings] = None): numa=settings.numa, # Chat Format Params chat_format=settings.chat_format, + chat_handler=chat_handler, # Misc verbose=settings.verbose, ) From 34aa8588f73ddcecbaccb4eee7c48a81e1e992b0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 22:14:48 -0500 Subject: [PATCH 24/26] Add Multimodal notebook --- examples/notebooks/Multimodal.ipynb | 84 +++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 examples/notebooks/Multimodal.ipynb diff --git a/examples/notebooks/Multimodal.ipynb b/examples/notebooks/Multimodal.ipynb new file mode 100644 index 000000000..11b14df38 --- /dev/null +++ b/examples/notebooks/Multimodal.ipynb @@ -0,0 +1,84 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ChatCompletion(id='chatcmpl-65a710ba-41d1-4d0a-a124-a44b2b4a0189', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content=' The image reads \"LlamaC++.\"', role='assistant', function_call=None, tool_calls=None))], created=1699413274, model='gpt-4-vision-preview', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=10, prompt_tokens=624, total_tokens=634))\n" + ] + } + ], + "source": [ + "from openai import OpenAI\n", + "\n", + "import urllib.request\n", + "import base64\n", + "\n", + "def get_data_url(url):\n", + " return \"data:image/png;base64,\" + base64.b64encode(urllib.request.urlopen(url).read()).decode(\"utf-8\")\n", + "\n", + "client = OpenAI(base_url=\"http://100.64.159.73:8000/v1\", api_key=\"sk-1234\")\n", + "response = client.chat.completions.create(\n", + " model=\"gpt-4-vision-preview\",\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image_url\",\n", + " \"image_url\": {\n", + " \"url\": get_data_url(\"https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png\"),\n", + " # \"url\": \"https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png\",\n", + " },\n", + " },\n", + " {\"type\": \"text\", \"text\": \"What does the image say\"},\n", + " ],\n", + " }\n", + " ],\n", + ")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5+" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + 
"nbformat_minor": 2 +} From 66dda361bbe86e19017fda72df541042afe2ee90 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 22:21:38 -0500 Subject: [PATCH 25/26] Clean up examples --- examples/multimodal/llava.py | 95 ------------------------- examples/multimodal/overfitting_lc.png | Bin 5982 -> 0 bytes 2 files changed, 95 deletions(-) delete mode 100644 examples/multimodal/llava.py delete mode 100644 examples/multimodal/overfitting_lc.png diff --git a/examples/multimodal/llava.py b/examples/multimodal/llava.py deleted file mode 100644 index ac5c33a73..000000000 --- a/examples/multimodal/llava.py +++ /dev/null @@ -1,95 +0,0 @@ -import ctypes -import argparse -import os -import array -import sys - -from llama_cpp import Llama -from llama_cpp.llava_cpp import ( - clip_model_load, - llava_image_embed_make_with_filename, - llava_image_embed_make_with_bytes, - llava_image_embed_free, - llava_validate_embed_size, - llava_eval_image_embed, -) - -parser = argparse.ArgumentParser() -parser.add_argument( - "-m", "--model", type=str, default="../models/llava-v1.5-7b/ggml-model-q5_k.gguf" -) -parser.add_argument("--mmproj", type=str, default="llava-v1.5-7b/mmproj-model-f16.gguf") -parser.add_argument("-t", "--temp", type=float, default=0.1) -parser.add_argument( - "-p", "--prompt", type=str, default="Describe this image in detail." -) -args = parser.parse_args() - -print(f"loading clip model from {args.mmproj}") -if not os.path.exists(args.mmproj): - raise FileNotFoundError(args.mmproj) -ctx_clip = clip_model_load(fname=args.mmproj.encode("utf-8"), verbosity=0) - -image_path = os.path.join(os.path.dirname(__file__), "overfitting_lc.png") -if not os.path.exists(image_path): - raise FileNotFoundError(image_path) -image_embed = llava_image_embed_make_with_filename( - ctx_clip=ctx_clip, n_threads=1, image_path=image_path.encode("utf8") -) - - -def load_image_embed_from_file_bytes(image_path: str): - with open(image_path, "rb") as file: - image_bytes = file.read() - bytes_length = len(image_bytes) - data_array = array.array("B", image_bytes) - c_ubyte_ptr = (ctypes.c_ubyte * len(data_array)).from_buffer(data_array) - return llava_image_embed_make_with_bytes( - ctx_clip=ctx_clip, - n_threads=1, - image_bytes=c_ubyte_ptr, - image_bytes_length=bytes_length, - ) - - -print(f"loading llm model from {args.model}") -if not os.path.exists(args.model): - raise FileNotFoundError(args.model) -llm = Llama( - model_path=args.model, n_ctx=2048, n_gpu_layers=1 -) # longer context needed for image embeds - -if not llava_validate_embed_size(llm.ctx, ctx_clip): - raise RuntimeError("llm and mmproj model embed size mismatch") - -# eval system prompt -system_prompt = "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n" -llm.eval(llm.tokenize(system_prompt.encode("utf8"))) -llm.eval(llm.tokenize("\nUSER: ".encode("utf8"))) - -# eval image embed -n_past = ctypes.c_int(llm.n_tokens) -n_past_p = ctypes.pointer(n_past) -llava_eval_image_embed(llm.ctx, image_embed, llm.n_batch, n_past_p) -llm.n_tokens = n_past.value -llava_image_embed_free(image_embed) - -# eval prompt -prompt = "Describe the visual content of this image" -llm.eval(llm.tokenize(prompt.encode("utf8"))) -llm.eval(llm.tokenize("\nASSISTANT:".encode("utf8"))) - -# get output -print("\n") -max_target_len = 256 -for i in range(max_target_len): - t_id = llm.sample(temp=0.1) - t = llm.detokenize([t_id]).decode("utf8") - if t == "": - break - print(t, end="") - sys.stdout.flush() - llm.eval([t_id]) - -print("\n") -print("done") diff --git a/examples/multimodal/overfitting_lc.png b/examples/multimodal/overfitting_lc.png deleted file mode 100644 index 591b34c68e1ca19bab4d790de6c98e70e773fdf9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5982 zcmV-k7oq5hP)p{QaDqod5j#`p>m=inHC_-Hwiq{N&cw*4FTkP}$kpNMxHz zXPs?rZSU~*{NK*^`1@FKr03`7MPizXiHY^Gg}Au5rnAk3m%H(mS!#i)Z^*w z^Wo+0Z-uZ!V3}iMV{mYA{`d0!@aUnh%khy@TXd!7>G9*|@M>ym|JvOD_4fb${zODX z@|s{%ZJ~B{c3@y&;o;%`-QWNG`@X)u^{;|GT$cF2o%5Y${_^Yo_3!cX_uk^|@sn29 z-01w`(*NM%&CSjF)4Jp1B&Zl#!8&3;zNJvP^%F0eoPWi*4^{syW>fVHeg!#y+^Pz0_xsrH{wf3`#``gEfn!We7j*Xqa zn3$OKqj1N^$JN^Bke|U#Xr8gX)avT$$Isw%b94Xt`v2nPs;a6uS(N_s?fcrr^{9CA zpK1EdvaGDEq@<*Ple%Ggsgk0?``yah-|CsD#>CCu!OGlwdwX47UH{kG|M2qv=IFMz zw$jql_q&z!rgXl?+5i9mR#sNg*5&`}?fvcK_r00@?&VWcQ~TDwt+~-=eXG2@y#IfN z|MT?!*4WU{(D3l^|M&QPkhuBBr_Ti>izBG;NsaB2|BRFWh>!lr%=N2#`O2&&CMKDgng6S>|GK{ag^d5Qx2>(M{QdnvKtTV_(G&{{ zsWV0LcS8QPr~X<|`A#*CYi$e$2HJp;(y6=IS8Hrmg^!Q2o{RSO000yjNklvzo^!9Z=k_+e=iGVx5Jc2~3E3eeS^rO)?C04l`LWi!-?i4e-WMPu zA|fIpA|j$m%mN01Ag=~9*#v^zux1ii3c`{M6SJfM01Pt%W-M4UoNu4Y7yv?Y4RULk z5IsF-u*{I4kqzr^Pad;2UP7(BOD%FALftaXV6z)RBO6w8GX-{LB^>MTnGJH#nVZKL zBm;2_SQ#Gy)XGM*p18QX*Am9<7vt-P2pZY2p4tP;pX345P)N@J0Lw)_&IFBYSiKpy z%tFpF2Adt20T~6%+QnwZ_BjO2n_?RnV$fn)eMSRd@3Tc;KLF(!b@h$|1kIb`oa6xP z#t=6NV0Lox02`nx3+_4)GDJi~L_|bHL_|bHM7pVMiwFaG#a`hvgHVuDH8+0TMkvUq zfZJ!!CXlWrm#FqXpFMkntog|YYry`-6Z<>@LOxp5I|kI`H@oK(&BXPNff^cp|8EE^ zx%Hdh5?FF;&fgPQa_hYhv;@|Ck69{j^sn}439KgGC6zb&;qSBr*6OVAeZ;`O4Hm-* z03dkaIoDjpjehyp8V-wJ{ZQ(j8m+NwGEr@#pZrF{VJ$f19VNXo>EIn9lbdm2>7`#z zy)NxrRmG+ARNCmL|D^E-ej)dAn$&FV=gpl@4E(sTjFtu-VgdkWU9(-2PhgGfIyhCw zb@2fJ`Kws_RoLja|DXkamUu^8wbQV-WqN@a^zoh2NY}t9y7ANupq^ zXV&I0@k0(6yW58v<3;Gr04$tgxVt&vPG&PCb?&zoU38NS#@F6M&2Hc$$Gkkp8jAho zM2M)Tol*}hP}NqOnE+tE&bESnvvuYDF>Z)*^UU;XV7!evcrw1>1+2Wx*`S&&Kn>ht z=NWt<9fmc#Az2v$+{s|DpEN>JXTeYtEbw<&^&0^|(t&Uf+8dqo^B6aV67D+eo(*h^ zk55=}_XVuRi)^WZU*r>?^Q3nK*1abIz`$)Y_RK-RUZ1HhSP+K=E_DKcBwjJ?zUi0- zo^p0?IAMr}%#+mM?h#l6=cKTPe!as)!b@0#GnWGex0z~(h3E)w9{`|&6-@U|;s;~K zI+SNP+XJ0GfM`X*3s?@fPr^8-8E%<_a9H^fkWd5E>;}~n0?AJ~9QDJZuBRSO2LO}v z!sj&mrejCZSmwryfJG~Mdv_azAjAgT z01yS$iDu6RhU}NI-L?VLBKLYXfO&@rh{`4EghlPL3S$Dm1yj>1S{r@;Z?$A1J=(}W zvQB0I5Nz-~P3uaaR)!q$H^=Ro4uErzwT{kR>i7Ssg|HAA!TlNlOx6iqXf#`2eX5PH zP;z)@3;^sq%_^g>(flh~35$x#j!g#uo2)GQjQj^$8dK`jy*F$D0Ko>~T-p{Ov^J)c zsxI6i1|T?Sw~4Opo{xXlQdr=3S@$yln6I;Qp=kkP%vFa;&^!-S!a}60cefFMd0SKH z3PqZxCQ^d=O<4yLRS65em!+Xw`%J^>c_peTrj%n-#A2O(kgrl$de>7A#{j?)&$YC8 z^6D}0NloFxxf_&muH}k@m$Z4eL<7h_$XZVatuMcs`~hhG+*H`%r`)%f^U%OUt3!MM zM%V04(GYL?$sZ=0)%=QLyVFWXQOOBQe9Su834lqk@Sx_s@{P(mgjp5JyNU81M_I7Z ztDgZNse`vc>37+8|2Ubb@~hVKO4lUO;P%LUc4vXB_ZbR6)qzq!W!s%p1D~`e#d8xq 
zk$LhTbSgQ_EdhWs&#WZPfko{)hf63xu-|j7lF>Dv{eF^A%_>_Z zEUut!DNd0ibZuU)(E##YSabJ510=Z6#7ilnF>Fc^SX8xj7zJQflkGaCHub;#=ShK8 zQG9=H8J)473LVa5hFTvJ128GFEmFn|Ooq)e@$0Z%yMf-&1I13V@^{4bPX{o6lW>j6 zl!ldbikEVY?!ZL_kNx__dbQ~RNbF!Kv0n4(pH$tgDzb#Z9<+xoDtbmbJ=Xg=1z=u> z?Sp(wO_}(ewq5I|f(I)Ki`wpg`6vKXZS{OGQCI%T$0{9xn|P(qRT4U^@acOh$}2V+ z0Fq2oCTc%FR^dt3`3J*U>#I}~IIMW80;8LOSA7A%rkHnRVtq?>2Hxbz0lTe!Mk;#x z1oAqFL#>wZ0SL-W!zVUP{NV3Y^lPd%@k*bo&S9QN5{&6ftdge#5ZIKaOo-OBuTn*c49WDGH{D7ITw#sQ`nETqM9+b95>vXra| z&{9Dz8iCDRp{YwEF!f*|@^JRo005hCPsK+f?FK%->Xgv4DN%qa2@Bi({nIf#5*2SyWt&VK9JFvkVn6j{dHY+JDR2AD@ zQfwh)&u27xRvxEqvo6g7rZg;s#z{j?1vXoGE_>883|Ft?r4)TWNs~?`VDXQ|JrV73P43z(Sg<_i<^~43i?()(i5!6!YIx-d`^} z%`$aK6hL!eAuWR2CY|H&aq-NO*WprnB5hK(N@#j*(zlqD%_h0f-@in9#o3>dB4@fl zWexo5O`fdtHUelGEGjDXaS1g%J=@AElIy>%)Z(9s%{rFn`XqI%X)B$llxo{Ke4rYb z7c6`x&*8)?b0*vz$L17|x5s55Yg1F22*vKZq{57_wZ#m5Q8^vQvIo}6++O97%2Q(cP_+5pe;*r0~w%#AVbA6&{ zV95Lp&umLKRYA?7DQmayMyvkmBY!0o3A0@2o8Bh6JQJ3lvQhsAI9(~uePaV}zTB?c zv!R%^YV2R9NHE*1q=Q+klxv?)g+$-9<%3Ui4eTD3v%|b8h1K-EKv@Hy=y#qc+^Xu0 zh)IWqlFQj`qtd}lD+{(Qe!fR3Sc#W9c*5WZ^8syxg_u-s-{|ByCi@St4#@na@`QE^ z5?7xuWrY`2nE+Y{OKf=ho zkzVTiHKoF}F0Xfvw;e|@*+19RHoT~8YN4XJ;0M*mO^tbdG(9h7r4-qi3*>Z4%`Y4Y z_GE6s8$GH(DZ+Mgy&se;<2S>dc$29OJHZH-Y_f3Ock_4E7qPdg|quY9T zH}Mr*W$Sph;d3v&7>=?JXL8D#)~499=2lE4o32V&K)2WBNLecee!=0w`)iy0R_}Ym zSDlvlxySzVYi(^k51dz7Alj6KHGNm$)ihZ*rd@b&YAx%2_~}j8Tz>!kJ`*D&j>-Ha znZi1j7@9K9|^2iJcwlLvHKRO zocX%ldStzMELz%_AW_+oSom^->@}P99f9?ljFPRc%I;hA2HW&2yCG_tIb z7+NUP-y#NHrmC%6WzZs}k!6p>YO6zUu0d!`X=J)-Ewy%igVwY?1eQ!uOT910=)E?P z{vClOTg;nYu}AfYftPbu-R7OT@Rj{UjW3O?LK(IB`$svA`rwl<39PaER#dis8wHKm zq`(4o^l)JJ%lIv=DcPqQ=^c&r+9f6iUQVSd70^2&GP-mc+A2rVgKt5RCW3d5p_ zftPpcbId9>QV+mqq-VWdjECIEeG+1`nOBN6V zFQ=lztpAS~czMN7{@?#uND)|a&J_RqFWGHQ1eSdI@V`>IyT+{DC~$k}giw~t@|ss9IOrfOsRzF>qVQ4Vo46!y znT8VB=UU`Zq8~tozaAOIBz?}MxoGra zc5tmW_lXHCxy0z}3gm|MNeC>t@@FwiHfMXewk7qe4fEe?mFl{=x-E^N?XZ+S$!f%p zE;+N~vbVSQPJh>9OZ%cD7l@}DX(ueY4h`_PKU{J(!ry!EUT(ypuF!|wUvDo-^W`(N z3KpGd?50N^jq0xMYB{`f@7_YM?9{^%u`O3Oc0CI1-*vP5_}6{g7na1Nr$tBl^7&#W z!$|XB(Ow$aXu6ocAhIOtX6TZzmWb5s>})QVyK|?vx3{;KmzS59S72aZU|?WeT-;`j zgQe<>A&dplF-NyYwH-fmbLp=B>d@rmN5_sW>FV0CWAj6egQd<~G=mNr21}i%!e;-N z&^SZ(tycZF=*Bzxldu2yi@w_A^X}OjPa9=!hn#%(EjoMl-O1N~{`pFn9FH|h*E zXBg618d>3Cc;m-}N=>p^>dYpY7gTOW47|KWvhKhl z`QSCgz-t_=k4YLk`K3f@WZi?MsZvi&>7QtM%)q&mFl@Ld`=+Ga4PdUkLk#@z%%!dp zm5#Gz-()Up2RQBnnh)z6dY)rQNksAGmVm@CV1@)>Fqv#n#zV`E0rt~QLkTQ7fP~#@ zNC6BcW9SwWpp0*^85#sj;nK*mHMoLZ+4JMdAV_L^at(SJa1L@`hDAQP=i*z@{Lah- z5hLL)%ziP*H3*?llTd%|Nf0pL=xAUuq;m#-a3&jhjwdB>yDj4Mo(sU1+nqxU`(HT5 zIkm7n(axcmjs+MnSOn2+H?+^}WpB$ggnJO9^(QyYfu%@kZ;+uuHtV` zkU9<|xvd!RaSwVb(jeF2541dn<0b(H%Lf=R6K!)d25z+i7%XpWWmf`bZrg5Am27}B zyFnxY1_LNiZcK0-8cz;U8f3G+(af$)*0ZMkM4A-*AWJkV5n23GKxA2&7 Date: Tue, 7 Nov 2023 22:42:13 -0500 Subject: [PATCH 26/26] Add server docs --- docs/server.md | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 docs/server.md diff --git a/docs/server.md b/docs/server.md new file mode 100644 index 000000000..e7d4bb6d6 --- /dev/null +++ b/docs/server.md @@ -0,0 +1,77 @@ +# OpenAI Compatible Server + +`llama-cpp-python` offers an OpenAI API compatible web server. + +This web server can be used to serve local models and easily connect them to existing clients. 
+ +## Setup + +### Installation + +The server can be installed by running the following command: + +```bash +pip install llama-cpp-python[server] +``` + +### Running the server + +The server can then be started by running the following command: + +```bash +python3 -m llama_cpp.server --model +``` + +### Server options + +For a full list of options, run: + +```bash +python3 -m llama_cpp.server --help +``` + +NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable. + +## Guides + +### Multi-modal Models + +`llama-cpp-python` supports the llava1.5 family of multi-modal models which allow the language model to +read information from both text and images. + +You'll first need to download one of the available multi-modal models in GGUF format: + +- [llava1.5 7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) +- [llava1.5 13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) + +Then when you run the server you'll need to also specify the path to the clip model used for image embedding + +```bash +python3 -m llama_cpp.server --model --clip-model-path +``` + +Then you can just use the OpenAI API as normal + +```python3 +from openai import OpenAI + +client = OpenAI(base_url="http://:/v1", api_key="sk-xxx") +response = client.chat.completions.create( + model="gpt-4-vision-preview", + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "" + }, + }, + {"type": "text", "text": "What does the image say"}, + ], + } + ], +) +print(response) +``` \ No newline at end of file
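
The guide above only exercises the new llava support through the OpenAI-compatible server. As a complementary illustration, the sketch below wires the same pieces together directly in Python: the `Llava15ChatHandler` introduced in PATCH 23/26 and the `chat_handler` parameter added to `Llama` in PATCH 17/26. This is a minimal sketch, not part of the patch series itself; the model and mmproj paths are placeholders, and `logits_all=True` mirrors the assertion inside `Llava15ChatHandler.__call__` rather than an independently documented requirement.

```python
# Minimal sketch (not part of the patches): multi-modal chat completion without
# going through the HTTP server. Model and projector paths are placeholders.
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Load the CLIP/projector model used for image embeddings.
chat_handler = Llava15ChatHandler(clip_model_path="llava-v1.5-7b/mmproj-model-f16.gguf")

llm = Llama(
    model_path="llava-v1.5-7b/ggml-model-q5_k.gguf",
    chat_handler=chat_handler,  # takes precedence over chat_format in create_chat_completion
    n_ctx=2048,                 # larger context to leave room for the image embedding
    logits_all=True,            # required by the llava chat handler (see the assert in __call__)
)

response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png"
                    },
                },
                {"type": "text", "text": "What does the image say?"},
            ],
        }
    ],
)
print(response["choices"][0]["message"]["content"])
```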
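
The change in PATCH 22/26, which lets `create_completion` accept a list of prompt tokens, is what allows the llava handler to hand over a prompt whose image embeddings have already been evaluated (it passes `llama._input_ids.tolist()`). The same signature can be used directly, as in the brief sketch below; the prompt text is only an illustrative placeholder and the `llm` instance is assumed to be the one constructed in the previous sketch.

```python
# Sketch: create_completion also accepts a pre-tokenized prompt (List[int]).
# Reuses the `llm` instance from the previous sketch; the prompt text is arbitrary.
tokens = llm.tokenize(b"USER: Describe overfitting in one sentence.\nASSISTANT:")
completion = llm.create_completion(prompt=tokens, max_tokens=64, temperature=0.1)
print(completion["choices"][0]["text"])
```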