From 4ec35390d72faba70942b9605dfcbde2bda0bdad Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Sun, 15 Oct 2023 19:35:23 +0200 Subject: [PATCH 01/26] llava v1.5 integration --- examples/multimodal/llava.py | 75 +++++++++++++++++++++++++ examples/multimodal/overfitting_lc.png | Bin 0 -> 5982 bytes llama_cpp/llama.py | 2 + llama_cpp/llama_cpp.py | 67 ++++++++++++++++++++++ llama_cpp/server/app.py | 4 ++ vendor/llama.cpp | 2 +- 6 files changed, 149 insertions(+), 1 deletion(-) create mode 100644 examples/multimodal/llava.py create mode 100644 examples/multimodal/overfitting_lc.png diff --git a/examples/multimodal/llava.py b/examples/multimodal/llava.py new file mode 100644 index 000000000..a209625c1 --- /dev/null +++ b/examples/multimodal/llava.py @@ -0,0 +1,75 @@ +import ctypes +import json +import argparse +import os +import array +import sys + +from llama_cpp import (Llama, clip_model_load, llava_image_embed_make_with_filename, llava_image_embed_make_with_bytes, + llava_image_embed_p, llava_image_embed_free, llava_validate_embed_size, llava_eval_image_embed) + +parser = argparse.ArgumentParser() +parser.add_argument("-m", "--model", type=str, default="../models/llava-v1.5-7b/ggml-model-q5_k.gguf") +parser.add_argument("--mmproj", type=str, default="llava-v1.5-7b/mmproj-model-f16.gguf") +parser.add_argument("-t", "--temp", type=float, default=0.1) +parser.add_argument("-p", "--prompt", type=str, default="Describe this image in detail.") +args = parser.parse_args() + +print(f"loading clip model from {args.mmproj}") +if not os.path.exists(args.mmproj): + raise FileNotFoundError(args.mmproj) +ctx_clip = clip_model_load(args.mmproj.encode('utf-8')) + +image_path = os.path.join(os.path.dirname(__file__), "overfitting_lc.png") +if not os.path.exists(image_path): + raise FileNotFoundError(image_path) +image_embed = llava_image_embed_make_with_filename(ctx_clip=ctx_clip, n_threads=1, filename=image_path.encode('utf8')) + +def load_image_embed_from_file_bytes(image_path: str) -> llava_image_embed_p: + with open(image_path, 'rb') as file: + image_bytes = file.read() + bytes_length = len(image_bytes) + data_array = array.array('B', image_bytes) + c_ubyte_ptr = (ctypes.c_ubyte * len(data_array)).from_buffer(data_array) + return llava_image_embed_make_with_bytes(ctx_clip=ctx_clip, n_threads=1, image_bytes=c_ubyte_ptr, image_bytes_length=bytes_length) + +print(f"loading llm model from {args.model}") +if not os.path.exists(args.model): + raise FileNotFoundError(args.model) +llm = Llama(model_path=args.model, n_ctx=2048, n_gpu_layers=1) # longer context needed for image embeds + +if not llava_validate_embed_size(llm.ctx, ctx_clip): + raise RuntimeError("llm and mmproj model embed size mismatch") + +# eval system prompt +system_prompt = "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
+llm.eval(llm.tokenize(system_prompt.encode('utf8')))
+llm.eval(llm.tokenize("\nUSER: ".encode('utf8')))
+
+# eval image embed
+n_past = ctypes.c_int(llm.n_tokens)
+n_past_p = ctypes.byref(n_past)
+llava_eval_image_embed(llm.ctx, image_embed, llm.n_batch, n_past_p)
+llm.n_tokens = n_past.value
+llava_image_embed_free(image_embed)
+
+# eval prompt
+prompt = 'Describe the visual content of this image'
+llm.eval(llm.tokenize(prompt.encode('utf8')))
+llm.eval(llm.tokenize("\nASSISTANT:".encode('utf8')))
+
+# get output
+print("\n")
+max_target_len = 256
+for i in range(max_target_len):
+    t_id = llm.sample(temp=0.1)
+    t = llm.detokenize([t_id]).decode('utf8')
+    if t == "":
+        break
+    print(t, end="")
+    sys.stdout.flush()
+    llm.eval([t_id])
+
+print("\n")
+print("done")
+
diff --git a/examples/multimodal/overfitting_lc.png b/examples/multimodal/overfitting_lc.png
new file mode 100644
index 0000000000000000000000000000000000000000..591b34c68e1ca19bab4d790de6c98e70e773fdf9
GIT binary patch
literal 5982
[base85-encoded binary PNG image data omitted]

+# LLAMA_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
+def clip_model_load(fname: Union[c_char_p, bytes], verbosity: c_int = 0) -> clip_ctx_p:
+    """ load mmproj model """
+    return _lib.clip_model_load(fname, verbosity)
+_lib.clip_model_load.argtypes = [c_char_p, c_int]
+_lib.clip_model_load.restype = clip_ctx_p
+
+
+# LLAMA_API void clip_free(struct clip_ctx * ctx);
+def clip_free(ctx: clip_ctx_p):
+    """ free mmproj model """
+    _lib.clip_free(ctx)
+_lib.clip_free.argtypes = [clip_ctx_p]
+_lib.clip_free.restype = None
+
+
+#LLAMA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
+def llava_validate_embed_size(ctx_llama: llama_context_p, ctx_clip: clip_ctx_p) -> c_bool:
+    """ sanity check for clip <-> llava embed size match """
+    return _lib.llava_validate_embed_size(ctx_llama, ctx_clip)
+_lib.llava_validate_embed_size.argtypes = [llama_context_p, clip_ctx_p]
+_lib.llava_validate_embed_size.restype = c_bool
+
+
+#LLAMA_API struct llava_image_embed *
llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
+def llava_image_embed_make_with_bytes(ctx_clip: clip_ctx_p, n_threads: Union[int,c_int], image_bytes: c_uint8_p, image_bytes_length: c_size_t) -> llava_image_embed_p:
+    """ build an image embed by interpreting image_bytes as the contents of an image file with byte size image_bytes_length.
+    supported formats (autodetected): JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC (ref https://github.com/nothings/stb) """
+    return _lib.llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length)
+_lib.llava_image_embed_make_with_bytes.argtypes = [clip_ctx_p, c_int, c_uint8_p, c_size_t]
+_lib.llava_image_embed_make_with_bytes.restype = llava_image_embed_p
+
+
+#LLAMA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
+def llava_image_embed_make_with_filename(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], filename: Union[c_char_p, bytes]) -> llava_image_embed_p:
+    """ build an image embed from a path to an image filename """
+    return _lib.llava_image_embed_make_with_filename(ctx_clip, n_threads, filename)
+_lib.llava_image_embed_make_with_filename.argtypes = [clip_ctx_p, c_int, c_char_p]
+_lib.llava_image_embed_make_with_filename.restype = llava_image_embed_p
+
+#LLAMA_API void llava_image_embed_free(struct llava_image_embed * embed);
+def llava_image_embed_free(embed: llava_image_embed_p):
+    """ free an embedding made with one of the llava_image_embed_make_ methods """
+    _lib.llava_image_embed_free(embed)
+_lib.llava_image_embed_free.argtypes = [llava_image_embed_p]
+_lib.llava_image_embed_free.restype = None
+
+#LLAMA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
+def llava_eval_image_embed(ctx: llama_context_p, image_embed: llava_image_embed_p, n_batch: c_int, n_past: c_int_p) -> c_bool:
+    """ write the image represented by embed into the llama context with batch size n_batch,
+    starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed."""
+    return _lib.llava_eval_image_embed(ctx, image_embed, n_batch, n_past)
+_lib.llava_eval_image_embed.argtypes = [llama_context_p, llava_image_embed_p, c_int, c_int_p]
+_lib.llava_eval_image_embed.restype = c_bool
+
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 18cd47ce1..30c7a0e63 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -41,6 +41,9 @@ class Settings(BaseSettings):
         default=None,
         description="The alias of the model to use for generating completions.",
     )
+    model_mproj: str = Field(
+        description="For multimodal models (eg Llava), the path to the multimodal projector model."
+    )
     seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed.
-1 for random.") n_ctx: int = Field(default=2048, ge=1, description="The context size.") n_batch: int = Field( @@ -345,6 +348,7 @@ def create_app(settings: Optional[Settings] = None): global llama llama = llama_cpp.Llama( model_path=settings.model, + model_mproj_path=settings.model_mproj, seed=settings.seed, n_ctx=settings.n_ctx, n_batch=settings.n_batch, diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b8fe4b5cc..5a9155189 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b8fe4b5cc9cb237ca98e5bc51b5d189e3c446d13 +Subproject commit 5a9155189945cd9aa6b98a4a340b38dc93c8d219 From 48f4228c05692936af6b9b6407ccfa8a4be789e4 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 13:25:29 -0500 Subject: [PATCH 02/26] Point llama.cpp to fork --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 7edf0975d..6fe937b38 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp - url = https://github.com/ggerganov/llama.cpp.git + url = https://github.com/damian0815/llama.cpp.git From 61a1e5c18733c9a47dc7d35b124fc6691ad8bba1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 13:25:43 -0500 Subject: [PATCH 03/26] Add llava shared library target --- CMakeLists.txt | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c633c0797..8d063708d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,4 +41,23 @@ if (LLAMA_BUILD) FILES $ DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp ) + add_subdirectory(vendor/llama.cpp/examples/llava) + set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava") + install( + TARGETS llava_shared + LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp + RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp + ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp + FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp + RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp + ) + # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374 + install( + TARGETS llava_shared + LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + ) endif() From 46ce32326f9005f14ff39020e43b5a0980e2448e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 13:25:51 -0500 Subject: [PATCH 04/26] Fix type --- llama_cpp/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index a4d21004f..e0dbdf854 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1078,7 +1078,7 @@ def llama_batch_get_one( tokens, # type: Array[llama_token] n_tokens: Union[c_int, int], pos_0: Union[llama_pos, int], - seq_id: llama_seq_id, + seq_id: Union[llama_seq_id, int], ) -> llama_batch: return _lib.llama_batch_get_one(tokens, n_tokens, pos_0, seq_id) From 0d8a91b7944492f28be4cf007ee4a6b4c83a412c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 13:26:15 -0500 Subject: [PATCH 05/26] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2833a6f63..22f43fca0 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp 
@@ -1 +1 @@ -Subproject commit 2833a6f63c1b87c7f4ac574bcf7a15a2f3bf3ede +Subproject commit 22f43fca0ac2237766f825a8ab4aa2d5e19238d0 From 0c950665103e08558be03e4725b387aadfa1bf03 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 13:26:25 -0500 Subject: [PATCH 06/26] Add llava api --- llama_cpp/llava_cpp.py | 232 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 llama_cpp/llava_cpp.py diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py new file mode 100644 index 000000000..5dc4b4201 --- /dev/null +++ b/llama_cpp/llava_cpp.py @@ -0,0 +1,232 @@ +import sys +import os +import ctypes +from ctypes import ( + c_bool, + c_char_p, + c_int, + c_int8, + c_int32, + c_uint8, + c_uint32, + c_size_t, + c_float, + c_double, + c_void_p, + POINTER, + _Pointer, # type: ignore + Structure, + Array, +) +import pathlib +from typing import List, Union + +import llama_cpp.llama_cpp as llama_cpp + +# Load the library +def _load_shared_library(lib_base_name: str): + # Construct the paths to the possible shared library names + _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) + # Searching for the library in the current directory under the name "libllama" (default name + # for llamacpp) and "llama" (default name for this repo) + _lib_paths: List[pathlib.Path] = [] + # Determine the file extension based on the platform + if sys.platform.startswith("linux"): + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + ] + elif sys.platform == "darwin": + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + _base_path / f"lib{lib_base_name}.dylib", + ] + elif sys.platform == "win32": + _lib_paths += [ + _base_path / f"{lib_base_name}.dll", + _base_path / f"lib{lib_base_name}.dll", + ] + else: + raise RuntimeError("Unsupported platform") + + if "LLAMA_CPP_LIB" in os.environ: + lib_base_name = os.environ["LLAMA_CPP_LIB"] + _lib = pathlib.Path(lib_base_name) + _base_path = _lib.parent.resolve() + _lib_paths = [_lib.resolve()] + + cdll_args = dict() # type: ignore + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32" and sys.version_info >= (3, 8): + os.add_dll_directory(str(_base_path)) + if "CUDA_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) + cdll_args["winmode"] = ctypes.RTLD_GLOBAL + + # Try to load the shared library, handling potential errors + for _lib_path in _lib_paths: + if _lib_path.exists(): + try: + return ctypes.CDLL(str(_lib_path), **cdll_args) + except Exception as e: + raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") + + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found" + ) + + +# Specify the base name of the shared library to load +_libllava_base_name = "llava" + +# Load the library +_libllava = _load_shared_library(_libllava_base_name) + + +################################################ +# llava.h +################################################ + +# struct clip_ctx; +clip_ctx_p = c_void_p + +# struct llava_image_embed { +# float * embed; +# int n_image_pos; +# }; +class llava_image_embed(Structure): + _fields_ = [ + ("embed", POINTER(c_float)), + ("n_image_pos", c_int), + ] + +# /** sanity check for clip <-> llava embed size match */ +# LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); +def llava_validate_embed_size(ctx_llama: 
llama_cpp.llama_context_p, ctx_clip: clip_ctx_p) -> bool: + return _libllava.llava_validate_embed_size(ctx_llama, ctx_clip) + +_libllava.llava_validate_embed_size.argtypes = [llama_cpp.llama_context_p, clip_ctx_p] +_libllava.llava_validate_embed_size.restype = c_bool + +# /** build an image embed from image file bytes */ +# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); +def llava_image_embed_make_with_bytes(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_bytes: bytes, image_bytes_length: Union[c_int, int]) -> "_Pointer[llava_image_embed]": + return _libllava.llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length) + +_libllava.llava_image_embed_make_with_bytes.argtypes = [clip_ctx_p, c_int, POINTER(c_uint8), c_int] +_libllava.llava_image_embed_make_with_bytes.restype = POINTER(llava_image_embed) + +# /** build an image embed from a path to an image filename */ +# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); +def llava_image_embed_make_with_filename(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes) -> "_Pointer[llava_image_embed]": + return _libllava.llava_image_embed_make_with_filename(ctx_clip, n_threads, image_path) + +_libllava.llava_image_embed_make_with_filename.argtypes = [clip_ctx_p, c_int, c_char_p] +_libllava.llava_image_embed_make_with_filename.restype = POINTER(llava_image_embed) + +# LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); +# /** free an embedding made with llava_image_embed_make_* */ +def llava_image_embed_free(embed: "_Pointer[llava_image_embed]"): + return _libllava.llava_image_embed_free(embed) + +_libllava.llava_image_embed_free.argtypes = [POINTER(llava_image_embed)] +_libllava.llava_image_embed_free.restype = None + +# /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. 
*/ +# LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); +def llava_eval_image_embed(ctx_llama: llama_cpp.llama_context_p, embed: "_Pointer[llava_image_embed]", n_batch: Union[c_int, int], n_past: Union[c_int, int]) -> bool: + return _libllava.llava_eval_image_embed(ctx_llama, embed, n_batch, n_past) + +_libllava.llava_eval_image_embed.argtypes = [llama_cpp.llama_context_p, POINTER(llava_image_embed), c_int, POINTER(c_int)] +_libllava.llava_eval_image_embed.restype = c_bool + + +################################################ +# clip.h +################################################ + + +# struct clip_vision_hparams { +# int32_t image_size; +# int32_t patch_size; +# int32_t hidden_size; +# int32_t n_intermediate; +# int32_t projection_dim; +# int32_t n_head; +# int32_t n_layer; +# float eps; +# }; +class clip_vision_hparams(Structure): + _fields_ = [ + ("image_size", c_int32), + ("patch_size", c_int32), + ("hidden_size", c_int32), + ("n_intermediate", c_int32), + ("projection_dim", c_int32), + ("n_head", c_int32), + ("n_layer", c_int32), + ("eps", c_float), + ] + +# /** load mmproj model */ +# CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity); +def clip_model_load(fname: bytes, verbosity: Union[c_int, int]) -> clip_ctx_p: + return _libllava.clip_model_load(fname, verbosity) + +_libllava.clip_model_load.argtypes = [c_char_p, c_int] +_libllava.clip_model_load.restype = clip_ctx_p + +# /** free mmproj model */ +# CLIP_API void clip_free(struct clip_ctx * ctx); +def clip_free(ctx: clip_ctx_p): + return _libllava.clip_free(ctx) + +_libllava.clip_free.argtypes = [clip_ctx_p] +_libllava.clip_free.restype = None + +# size_t clip_embd_nbytes(const struct clip_ctx * ctx); +# int clip_n_patches(const struct clip_ctx * ctx); +# int clip_n_mmproj_embd(const struct clip_ctx * ctx); + +# // RGB uint8 image +# struct clip_image_u8 { +# int nx; +# int ny; +# uint8_t * data = NULL; +# size_t size; +# }; + +# // RGB float32 image (NHWC) +# // Memory layout: RGBRGBRGB... 
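# A minimal usage sketch of the llava.h bindings defined above (an illustrative
# sketch, not part of the patch itself; the model, projector and image paths and
# the thread count follow the defaults used in examples/multimodal/llava.py and
# are placeholder assumptions):
#
#     import ctypes
#     from llama_cpp import Llama
#     from llama_cpp.llava_cpp import (
#         clip_model_load, clip_free,
#         llava_image_embed_make_with_filename, llava_image_embed_free,
#         llava_validate_embed_size, llava_eval_image_embed,
#     )
#
#     llm = Llama(model_path="llava-v1.5-7b/ggml-model-q5_k.gguf", n_ctx=2048)
#     ctx_clip = clip_model_load(b"llava-v1.5-7b/mmproj-model-f16.gguf", 1)
#     assert llava_validate_embed_size(llm.ctx, ctx_clip)  # embed sizes must match
#
#     embed = llava_image_embed_make_with_filename(ctx_clip, 1, b"overfitting_lc.png")
#     n_past = ctypes.c_int(llm.n_tokens)
#     llava_eval_image_embed(llm.ctx, embed, llm.n_batch, ctypes.byref(n_past))
#     llm.n_tokens = n_past.value  # resync the wrapper's token counter
#
#     llava_image_embed_free(embed)
#     clip_free(ctx_clip)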
+# struct clip_image_f32 { +# int nx; +# int ny; +# float * data = NULL; +# size_t size; +# }; + +# struct clip_image_u8_batch { +# struct clip_image_u8 * data; +# size_t size; +# }; + +# struct clip_image_f32_batch { +# struct clip_image_f32 * data; +# size_t size; +# }; + +# struct clip_image_u8 * make_clip_image_u8(); +# struct clip_image_f32 * make_clip_image_f32(); +# CLIP_API void clip_image_u8_free(clip_image_u8 * img); +# CLIP_API void clip_image_f32_free(clip_image_f32 * img); +# CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); +# /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ +# CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); + +# bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square); +# bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec); + +# bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs, +# float * vec); + +# bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype); \ No newline at end of file From 9406d631e02f53badd31199f82681f0dfc3296bc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 13:55:28 -0500 Subject: [PATCH 07/26] Revert changes to llama and llama_cpp --- llama_cpp/llama.py | 935 +++++++++++++++++++++++++++-------------- llama_cpp/llama_cpp.py | 69 +-- 2 files changed, 632 insertions(+), 372 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ba70f2060..6dc113ac9 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -208,6 +208,506 @@ def __call__( return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) +class _LlamaModel: + """Intermediate Python wrapper for a llama.cpp llama_model. 
+ + NOTE: For stability it's recommended you use the Llama class instead.""" + + _llama_free_model = llama_cpp._lib.llama_free_model # type: ignore + + def __init__( + self, + *, + path_model: str, + params: llama_cpp.llama_model_params, + verbose: bool = True, + ): + self.path_model = path_model + self.params = params + self.verbose = verbose + + if not os.path.exists(path_model): + raise ValueError(f"Model path does not exist: {path_model}") + + with suppress_stdout_stderr(disable=self.verbose): + self.model = llama_cpp.llama_load_model_from_file( + self.path_model.encode("utf-8"), self.params + ) + + def __del__(self): + with suppress_stdout_stderr(disable=self.verbose): + if self.model is not None: + self._llama_free_model(self.model) + self.model = None + + def vocab_type(self) -> int: + assert self.model is not None + return llama_cpp.llama_vocab_type(self.model) + + def n_vocab(self) -> int: + assert self.model is not None + return llama_cpp.llama_n_vocab(self.model) + + def n_ctx_train(self) -> int: + assert self.model is not None + return llama_cpp.llama_n_ctx_train(self.model) + + def n_embd(self) -> int: + assert self.model is not None + return llama_cpp.llama_n_embd(self.model) + + def rope_freq_scale_train(self) -> float: + assert self.model is not None + return llama_cpp.llama_rope_freq_scale_train(self.model) + + def desc(self) -> str: + assert self.model is not None + buf = ctypes.create_string_buffer(1024) + llama_cpp.llama_model_desc(self.model, buf, 1024) # type: ignore + return buf.value.decode("utf-8") + + def size(self) -> int: + assert self.model is not None + return llama_cpp.llama_model_size(self.model) + + def n_params(self) -> int: + assert self.model is not None + return llama_cpp.llama_model_n_params(self.model) + + def get_tensor(self, name: str) -> ctypes.c_void_p: + assert self.model is not None + return llama_cpp.llama_get_model_tensor(self.model, name.encode("utf-8")) + + def apply_lora_from_file( + self, + lora_path: str, + scale: float, + path_base_model: Optional[str], + n_threads: int, + ): + assert self.model is not None + return llama_cpp.llama_model_apply_lora_from_file( + self.model, + lora_path.encode("utf-8"), + scale, + path_base_model.encode("utf-8") + if path_base_model is not None + else llama_cpp.c_char_p(0), + n_threads, + ) + + # Vocab + + def token_get_text(self, token: int) -> str: + # TODO: Fix + assert self.model is not None + return llama_cpp.llama_token_get_text(self.model, token).decode("utf-8") + + def token_get_score(self, token: int) -> float: + assert self.model is not None + return llama_cpp.llama_token_get_score(self.model, token) + + def token_get_type(self, token: int) -> int: + assert self.model is not None + return llama_cpp.llama_token_get_type(self.model, token) + + # Special tokens + + def token_bos(self) -> int: + assert self.model is not None + return llama_cpp.llama_token_bos(self.model) + + def token_eos(self) -> int: + assert self.model is not None + return llama_cpp.llama_token_eos(self.model) + + def token_nl(self) -> int: + assert self.model is not None + return llama_cpp.llama_token_nl(self.model) + + def token_prefix(self) -> int: + assert self.model is not None + return llama_cpp.llama_token_prefix(self.model) + + def token_middle(self) -> int: + assert self.model is not None + return llama_cpp.llama_token_middle(self.model) + + def token_suffix(self) -> int: + assert self.model is not None + return llama_cpp.llama_token_suffix(self.model) + + def token_eot(self) -> int: + assert self.model is not None + 
return llama_cpp.llama_token_eot(self.model) + + # Tokenization + + def tokenize(self, text: bytes, add_bos: bool, special: bool): + assert self.model is not None + n_ctx = self.n_ctx_train() + tokens = (llama_cpp.llama_token * n_ctx)() + n_tokens = llama_cpp.llama_tokenize( + self.model, text, len(text), tokens, n_ctx, add_bos, special + ) + if n_tokens < 0: + n_tokens = abs(n_tokens) + tokens = (llama_cpp.llama_token * n_tokens)() + n_tokens = llama_cpp.llama_tokenize( + self.model, text, len(text), tokens, n_tokens, add_bos, special + ) + if n_tokens < 0: + raise RuntimeError( + f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' + ) + return list(tokens[:n_tokens]) + + def token_to_piece(self, token: int) -> bytes: + assert self.model is not None + buf = ctypes.create_string_buffer(32) + llama_cpp.llama_token_to_piece(self.model, token, buf, 32) # type: ignore + return bytes(buf) + + def detokenize(self, tokens: List[int]) -> bytes: + assert self.model is not None + output = b"" + size = 32 + buffer = (ctypes.c_char * size)() + for token in tokens: + n = llama_cpp.llama_token_to_piece( + self.model, llama_cpp.llama_token(token), buffer, size + ) + assert n <= size + output += bytes(buffer[:n]) + # NOTE: Llama1 models automatically added a space at the start of the prompt + # this line removes a leading space if the first token is a beginning of sentence token + return ( + output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output + ) + + @staticmethod + def default_params(): + """Get the default llama_model_params.""" + return llama_cpp.llama_model_default_params() + + +class _LlamaContext: + """Intermediate Python wrapper for a llama.cpp llama_context. + + NOTE: For stability it's recommended you use the Llama class instead.""" + + _llama_free = llama_cpp._lib.llama_free # type: ignore + + def __init__( + self, + *, + model: _LlamaModel, + params: llama_cpp.llama_context_params, + verbose: bool = True, + ): + self.model = model + self.params = params + self.verbose = verbose + + with suppress_stdout_stderr(disable=self.verbose): + self.ctx = llama_cpp.llama_new_context_with_model( + self.model.model, self.params + ) + + def __del__(self): + with suppress_stdout_stderr(disable=self.verbose): + if self.ctx is not None: + self._llama_free(self.ctx) + self.ctx = None + + def n_ctx(self) -> int: + assert self.ctx is not None + return llama_cpp.llama_n_ctx(self.ctx) + + def kv_cache_clear(self): + assert self.ctx is not None + llama_cpp.llama_kv_cache_clear(self.ctx) + + def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int): + assert self.ctx is not None + llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1) + + def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): + assert self.ctx is not None + llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1) + + def kv_cache_seq_keep(self, seq_id: int): + assert self.ctx is not None + llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id) + + def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int): + assert self.ctx is not None + llama_cpp.llama_kv_cache_seq_shift(self.ctx, seq_id, p0, p1, shift) + + def get_state_size(self) -> int: + assert self.ctx is not None + return llama_cpp.llama_get_state_size(self.ctx) + + # TODO: copy_state_data + + # TODO: set_state_data + + # TODO: llama_load_session_file + + # TODO: llama_save_session_file + + def decode(self, batch: "_LlamaBatch"): + assert self.ctx is not None + assert batch.batch is not None + return_code = 
llama_cpp.llama_decode( + ctx=self.ctx, + batch=batch.batch, + ) + if return_code != 0: + raise RuntimeError(f"llama_decode returned {return_code}") + + def set_n_threads(self, n_threads: int, n_threads_batch: int): + assert self.ctx is not None + llama_cpp.llama_set_n_threads(self.ctx, n_threads, n_threads_batch) + + def get_logits(self): + assert self.ctx is not None + return llama_cpp.llama_get_logits(self.ctx) + + def get_logits_ith(self, i: int): + assert self.ctx is not None + return llama_cpp.llama_get_logits_ith(self.ctx, i) + + def get_embeddings(self): + assert self.ctx is not None + return llama_cpp.llama_get_embeddings(self.ctx) + + # Sampling functions + + def set_rng_seed(self, seed: int): + assert self.ctx is not None + llama_cpp.llama_set_rng_seed(self.ctx, seed) + + def sample_repetition_penalties( + self, + candidates: "_LlamaTokenDataArray", + last_tokens_data: "llama_cpp.Array[llama_cpp.llama_token]", + penalty_last_n: int, + penalty_repeat: float, + penalty_freq: float, + penalty_present: float, + ): + assert self.ctx is not None + llama_cpp.llama_sample_repetition_penalties( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + last_tokens_data, + penalty_last_n, + penalty_repeat, + penalty_freq, + penalty_present, + ) + + def sample_classifier_free_guidance( + self, + candidates: "_LlamaTokenDataArray", + guidance_ctx: "_LlamaContext", + scale: float, + ): + assert self.ctx is not None + assert guidance_ctx.ctx is not None + llama_cpp.llama_sample_classifier_free_guidance( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + guidance_ctx.ctx, + scale, + ) + + def sample_softmax(self, candidates: "_LlamaTokenDataArray"): + assert self.ctx is not None + llama_cpp.llama_sample_softmax( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + ) + + def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int): + assert self.ctx is not None + llama_cpp.llama_sample_top_k( + self.ctx, ctypes.byref(candidates.candidates), k, min_keep # type: ignore + ) + + def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): + assert self.ctx is not None + llama_cpp.llama_sample_top_p( + self.ctx, ctypes.byref(candidates.candidates), p, min_keep # type: ignore + ) + + def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): + assert self.ctx is not None + llama_cpp.llama_sample_min_p( + self.ctx, ctypes.byref(candidates.candidates), p, min_keep # type: ignore + ) + + def sample_tail_free( + self, candidates: "_LlamaTokenDataArray", z: float, min_keep: int + ): + assert self.ctx is not None + llama_cpp.llama_sample_tail_free( + self.ctx, ctypes.byref(candidates.candidates), z, min_keep # type: ignore + ) + + def sample_typical( + self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int + ): + assert self.ctx is not None + llama_cpp.llama_sample_typical( + self.ctx, ctypes.byref(candidates.candidates), p, min_keep # type: ignore + ) + + def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float): + assert self.ctx is not None + llama_cpp.llama_sample_temp( + self.ctx, ctypes.byref(candidates.candidates), temp # type: ignore + ) + + def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar): + assert self.ctx is not None + assert grammar.grammar is not None + llama_cpp.llama_sample_grammar( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + grammar.grammar, + ) + + def sample_token_mirostat( + self, + candidates: 
"_LlamaTokenDataArray", + tau: float, + eta: float, + m: int, + mu: float, + ) -> int: + assert self.ctx is not None + return llama_cpp.llama_sample_token_mirostat( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + tau, + eta, + m, + ctypes.pointer(ctypes.c_float(mu)), + ) + + def sample_token_mirostat_v2( + self, candidates: "_LlamaTokenDataArray", tau: float, eta: float, mu: float + ) -> int: + assert self.ctx is not None + return llama_cpp.llama_sample_token_mirostat_v2( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + tau, + eta, + ctypes.pointer(ctypes.c_float(mu)), + ) + + def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int: + assert self.ctx is not None + return llama_cpp.llama_sample_token_greedy( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + ) + + def sample_token(self, candidates: "_LlamaTokenDataArray") -> int: + assert self.ctx is not None + return llama_cpp.llama_sample_token( + self.ctx, + ctypes.byref(candidates.candidates), # type: ignore + ) + + # Grammar + def grammar_accept_token(self, grammar: LlamaGrammar, token: int): + assert self.ctx is not None + assert grammar.grammar is not None + llama_cpp.llama_grammar_accept_token(self.ctx, grammar.grammar, token) + + def reset_timings(self): + assert self.ctx is not None + llama_cpp.llama_reset_timings(self.ctx) + + def print_timings(self): + assert self.ctx is not None + llama_cpp.llama_print_timings(self.ctx) + + # Utility functions + @staticmethod + def default_params(): + """Get the default llama_context_params.""" + return llama_cpp.llama_context_default_params() + + +class _LlamaBatch: + _llama_batch_free = llama_cpp._lib.llama_batch_free # type: ignore + + def __init__( + self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True + ): + self.n_tokens = n_tokens + self.embd = embd + self.n_seq_max = n_seq_max + self.verbose = verbose + + with suppress_stdout_stderr(disable=self.verbose): + self.batch = llama_cpp.llama_batch_init( + self.n_tokens, self.embd, self.n_seq_max + ) + + def __del__(self): + with suppress_stdout_stderr(disable=self.verbose): + if self.batch is not None: + self._llama_batch_free(self.batch) + self.batch = None + + def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool): + assert self.batch is not None + n_tokens = len(batch) + self.batch.n_tokens = n_tokens + for i in range(n_tokens): + self.batch.token[i] = batch[i] + self.batch.pos[i] = n_past + i + self.batch.seq_id[i][0] = 0 + self.batch.n_seq_id[i] = 1 + self.batch.logits[i] = logits_all + self.batch.logits[n_tokens - 1] = True + + +class _LlamaTokenDataArray: + def __init__(self, *, n_vocab: int): + self.n_vocab = n_vocab + self.candidates_data = np.array( + [], + dtype=np.dtype( + [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True + ), + ) + self.candidates_data.resize(3, self.n_vocab, refcheck=False) + self.candidates = llama_cpp.llama_token_data_array( + data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p), + size=self.n_vocab, + sorted=False, + ) + self.default_candidates_data_id = np.arange(self.n_vocab, dtype=np.intc) + self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single) + + def copy_logits(self, logits: npt.NDArray[np.single]): + self.candidates_data["id"][:] = self.default_candidates_data_id + self.candidates_data["logit"][:] = logits + self.candidates_data["p"][:] = self.default_candidates_data_p + self.candidates.data = self.candidates_data.ctypes.data_as( + 
llama_cpp.llama_token_data_p + ) + self.candidates.sorted = llama_cpp.c_bool(False) + self.candidates.size = llama_cpp.c_size_t(self.n_vocab) + + class Llama: """High-level Python wrapper for a llama.cpp model.""" @@ -248,8 +748,6 @@ def __init__( lora_base: Optional[str] = None, lora_scale: float = 1.0, lora_path: Optional[str] = None, - # Multimodal Params - model_mproj_path: str = None, # Backend Params numa: bool = False, # Chat Format Params @@ -314,7 +812,9 @@ def __init__( self._p_tensor_split = None if self.tensor_split is not None: if len(self.tensor_split) > llama_cpp.LLAMA_MAX_DEVICES: - raise ValueError(f"Attempt to split tensors that exceed maximum supported devices. Current LLAMA_MAX_DEVICES={llama_cpp.LLAMA_MAX_DEVICES}") + raise ValueError( + f"Attempt to split tensors that exceed maximum supported devices. Current LLAMA_MAX_DEVICES={llama_cpp.LLAMA_MAX_DEVICES}" + ) # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES self._c_tensor_split = FloatArray( @@ -338,7 +838,9 @@ def __init__( self.context_params.n_threads = self.n_threads self.context_params.n_threads_batch = self.n_threads_batch self.context_params.rope_scaling_type = ( - rope_scaling_type if rope_scaling_type is not None else llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED + rope_scaling_type + if rope_scaling_type is not None + else llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED ) self.context_params.rope_freq_base = ( rope_freq_base if rope_freq_base != 0.0 else 0 @@ -358,9 +860,7 @@ def __init__( self.context_params.yarn_beta_slow = ( yarn_beta_slow if yarn_beta_slow != 0.0 else 0 ) - self.context_params.yarn_orig_ctx = ( - yarn_orig_ctx if yarn_orig_ctx != 0 else 0 - ) + self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 self.context_params.mul_mat_q = mul_mat_q self.context_params.f16_kv = f16_kv self.context_params.logits_all = logits_all @@ -378,32 +878,28 @@ def __init__( if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") - with suppress_stdout_stderr(disable=self.verbose): - self.model = llama_cpp.llama_load_model_from_file( - self.model_path.encode("utf-8"), self.model_params - ) - assert self.model is not None - - with suppress_stdout_stderr(disable=self.verbose): - self.ctx = llama_cpp.llama_new_context_with_model( - self.model, self.context_params - ) + self._model = _LlamaModel( + path_model=self.model_path, params=self.model_params, verbose=self.verbose + ) - assert self.ctx is not None + self._ctx = _LlamaContext( + model=self._model, + params=self.context_params, + verbose=self.verbose, + ) - with suppress_stdout_stderr(disable=self.verbose): - self.batch = llama_cpp.llama_batch_init( - self.n_batch, 0, 1 - ) + self._batch = _LlamaBatch( + n_tokens=self.n_batch, + embd=0, + n_seq_max=self.context_params.n_ctx, + verbose=self.verbose, + ) if self.lora_path: - if llama_cpp.llama_model_apply_lora_from_file( - self.model, - self.lora_path.encode("utf-8"), + if self._model.apply_lora_from_file( + self.lora_path, self.lora_scale, - self.lora_base.encode("utf-8") - if self.lora_base is not None - else llama_cpp.c_char_p(0), + self.lora_base, self.n_threads, ): raise RuntimeError( @@ -417,25 +913,11 @@ def __init__( self._n_vocab = self.n_vocab() self._n_ctx = self.n_ctx() - size = self._n_vocab - sorted = False - self._candidates_data = np.array( - [], - dtype=np.dtype( - [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True - ), - ) - 
self._candidates_data.resize(3, self._n_vocab, refcheck=False) - candidates = llama_cpp.llama_token_data_array( - data=self._candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p), - size=size, - sorted=sorted, - ) - self._candidates = candidates + self._token_nl = self.token_nl() self._token_eos = self.token_eos() - self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc) # type: ignore - self._candidates_data_p = np.zeros(self._n_vocab, dtype=np.single) + + self._candidates = _LlamaTokenDataArray(n_vocab=self._n_vocab) self.n_tokens = 0 self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) @@ -443,6 +925,16 @@ def __init__( (n_ctx, self._n_vocab), dtype=np.single ) + @property + def ctx(self) -> llama_cpp.llama_context_p: + assert self._ctx.ctx is not None + return self._ctx.ctx + + @property + def model(self) -> llama_cpp.llama_model_p: + assert self._model.model is not None + return self._model.model + @property def _input_ids(self) -> npt.NDArray[np.intc]: return self.input_ids[: self.n_tokens] @@ -462,7 +954,9 @@ def eval_logits(self) -> Deque[List[float]]: maxlen=self._n_ctx if self.context_params.logits_all else 1, ) - def tokenize(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]: + def tokenize( + self, text: bytes, add_bos: bool = True, special: bool = False + ) -> List[int]: """Tokenize a string. Args: @@ -474,35 +968,7 @@ def tokenize(self, text: bytes, add_bos: bool = True, special: bool = False) -> Returns: A list of tokens. """ - assert self.model is not None - n_ctx = self._n_ctx - tokens = (llama_cpp.llama_token * n_ctx)() - n_tokens = llama_cpp.llama_tokenize( - self.model, - text, - len(text), - tokens, - n_ctx, - add_bos, - special - ) - if n_tokens < 0: - n_tokens = abs(n_tokens) - tokens = (llama_cpp.llama_token * n_tokens)() - n_tokens = llama_cpp.llama_tokenize( - self.model, - text, - len(text), - tokens, - n_tokens, - add_bos, - special - ) - if n_tokens < 0: - raise RuntimeError( - f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' - ) - return list(tokens[:n_tokens]) + return self._model.tokenize(text, add_bos, special) def detokenize(self, tokens: List[int]) -> bytes: """Detokenize a list of tokens. @@ -513,21 +979,7 @@ def detokenize(self, tokens: List[int]) -> bytes: Returns: The detokenized string. """ - assert self.model is not None - output = b"" - size = 32 - buffer = (ctypes.c_char * size)() - for token in tokens: - n = llama_cpp.llama_token_to_piece( - self.model, llama_cpp.llama_token(token), buffer, size - ) - assert n <= size - output += bytes(buffer[:n]) - # NOTE: Llama1 models automatically added a space at the start of the prompt - # this line removes a leading space if the first token is a beginning of sentence token - return ( - output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output - ) + return self._model.detokenize(tokens) def set_cache(self, cache: Optional[BaseLlamaCache]): """Set the cache. @@ -547,28 +999,18 @@ def eval(self, tokens: Sequence[int]): Args: tokens: The list of tokens to evaluate. 
""" - assert self.ctx is not None - assert self.batch is not None + assert self._ctx.ctx is not None + assert self._batch.batch is not None n_ctx = self._n_ctx for i in range(0, len(tokens), self.n_batch): batch = tokens[i : min(len(tokens), i + self.n_batch)] - n_past = min(n_ctx - len(batch), len(self._input_ids)) + n_past = min(n_ctx - len(batch), self.n_tokens) n_tokens = len(batch) - llama_cpp.llama_kv_cache_seq_rm(self.ctx, -1, n_past, -1) - self.batch.n_tokens = n_tokens - for i in range(n_tokens): - self.batch.token[i] = batch[i] - self.batch.pos[i] = n_past + i - self.batch.seq_id[i][0] = 0 - self.batch.n_seq_id[i] = 1 - self.batch.logits[i] = True if self.context_params.logits_all else False - self.batch.logits[n_tokens - 1] = True - return_code = llama_cpp.llama_decode( - ctx=self.ctx, - batch=self.batch, + self._ctx.kv_cache_seq_rm(-1, n_past, -1) + self._batch.set_batch( + batch=batch, n_past=n_past, logits_all=self.context_params.logits_all ) - if return_code != 0: - raise RuntimeError(f"llama_decode returned {return_code}") + self._ctx.decode(self._batch) # Save tokens self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch # Save logits @@ -579,195 +1021,106 @@ def eval(self, tokens: Sequence[int]): ) # NOTE: Only save the last token logits if logits_all is False self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape( -1 - )[:] = llama_cpp.llama_get_logits(self.ctx)[: rows * cols] + )[:] = self._ctx.get_logits()[: rows * cols] # Update n_tokens self.n_tokens += n_tokens - def _sample( + def sample( self, - last_n_tokens_data, # type: llama_cpp.Array[llama_cpp.llama_token] - last_n_tokens_size: int, - top_k: int, - top_p: float, - temp: float, - tfs_z: float, - repeat_penalty: float, - frequency_penalty: float, - presence_penalty: float, - mirostat_mode: float, - mirostat_tau: float, - mirostat_eta: float, + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_eta: float = 0.1, + mirostat_tau: float = 5.0, penalize_nl: bool = True, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, ): - assert self.ctx is not None + """Sample a token from the model. + + Args: + top_k: The top-k sampling parameter. + top_p: The top-p sampling parameter. + temp: The temperature parameter. + repeat_penalty: The repeat penalty parameter. + + Returns: + The sampled token. 
+ """ + assert self._ctx is not None assert self.n_tokens > 0 + last_n_tokens_data = [llama_cpp.llama_token(0)] * max( + 0, self.last_n_tokens_size - self.n_tokens + ) + self._input_ids[-self.last_n_tokens_size :].tolist() + last_n_tokens_size = len(last_n_tokens_data) n_vocab = self._n_vocab n_ctx = self._n_ctx top_k = n_vocab if top_k <= 0 else top_k last_n_tokens_size = n_ctx if last_n_tokens_size < 0 else last_n_tokens_size + last_n_tokens_data_c = (llama_cpp.llama_token * last_n_tokens_size)( + *last_n_tokens_data + ) logits: npt.NDArray[np.single] = self._scores[-1, :] if logits_processor is not None: logits[:] = logits_processor(self._input_ids, logits) nl_logit = logits[self._token_nl] - candidates = self._candidates - candidates_data = self._candidates_data - candidates_data["id"][:] = self._candidates_data_id # type: ignore - candidates_data["logit"][:] = logits - candidates_data["p"][:] = self._candidates_data_p # type: ignore - candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) - candidates.sorted = llama_cpp.c_bool(False) - candidates.size = llama_cpp.c_size_t(n_vocab) - llama_cpp.llama_sample_repetition_penalties( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - last_tokens_data=last_n_tokens_data, + self._candidates.copy_logits(logits) + self._ctx.sample_repetition_penalties( + candidates=self._candidates, + last_tokens_data=last_n_tokens_data_c, penalty_last_n=last_n_tokens_size, penalty_repeat=repeat_penalty, penalty_freq=frequency_penalty, penalty_present=presence_penalty, ) if not penalize_nl: - candidates.data[self._token_nl].logit = llama_cpp.c_float(nl_logit) + self._candidates.candidates.data[self._token_nl].logit = llama_cpp.c_float( + nl_logit + ) if grammar is not None: - llama_cpp.llama_sample_grammar( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - grammar=grammar.grammar, + self._ctx.sample_grammar( + candidates=self._candidates, + grammar=grammar, ) if temp == 0.0: - id = llama_cpp.llama_sample_token_greedy( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - ) + id = self._ctx.sample_token_greedy(candidates=self._candidates) elif mirostat_mode == 1: - mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau) - mirostat_m = llama_cpp.c_int(100) - llama_cpp.llama_sample_temperature( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - temp=temp, - ) - id = llama_cpp.llama_sample_token_mirostat( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + self._ctx.sample_temp(candidates=self._candidates, temp=temp) + id = self._ctx.sample_token_mirostat( + candidates=self._candidates, tau=mirostat_tau, eta=mirostat_eta, - mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore - m=mirostat_m, + mu=2.0 * mirostat_tau, + m=100, ) elif mirostat_mode == 2: - mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau) - llama_cpp.llama_sample_temperature( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - temp=temp, - ) - id = llama_cpp.llama_sample_token_mirostat_v2( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + self._ctx.sample_temp(candidates=self._candidates, temp=temp) + id = self._ctx.sample_token_mirostat_v2( + candidates=self._candidates, tau=mirostat_tau, eta=mirostat_eta, - mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore + mu=2.0 * mirostat_tau, ) else: - llama_cpp.llama_sample_top_k( - ctx=self.ctx, - 
candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - k=top_k, - min_keep=llama_cpp.c_size_t(1), - ) - llama_cpp.llama_sample_tail_free( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - z=tfs_z, - min_keep=llama_cpp.c_size_t(1), - ) - llama_cpp.llama_sample_typical( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - p=llama_cpp.c_float(1.0), - min_keep=llama_cpp.c_size_t(1), - ) - llama_cpp.llama_sample_top_p( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - p=top_p, - min_keep=llama_cpp.c_size_t(1), - ) - llama_cpp.llama_sample_temperature( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - temp=temp, - ) - id = llama_cpp.llama_sample_token( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - ) + self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1) + self._ctx.sample_tail_free(candidates=self._candidates, z=tfs_z, min_keep=1) + self._ctx.sample_typical(candidates=self._candidates, p=1.0, min_keep=1) + self._ctx.sample_top_p(candidates=self._candidates, p=top_p, min_keep=1) + self._ctx.sample_temp(candidates=self._candidates, temp=temp) + id = self._ctx.sample_token(candidates=self._candidates) if grammar is not None: - llama_cpp.llama_grammar_accept_token( - ctx=self.ctx, - grammar=grammar.grammar, - token=llama_cpp.ctypes.c_int(id), - ) + self._ctx.grammar_accept_token(grammar=grammar, token=id) return id - def sample( - self, - top_k: int = 40, - top_p: float = 0.95, - temp: float = 0.80, - repeat_penalty: float = 1.1, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_eta: float = 0.1, - mirostat_tau: float = 5.0, - penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, - grammar: Optional[LlamaGrammar] = None, - ): - """Sample a token from the model. - - Args: - top_k: The top-k sampling parameter. - top_p: The top-p sampling parameter. - temp: The temperature parameter. - repeat_penalty: The repeat penalty parameter. - - Returns: - The sampled token. - """ - assert self.ctx is not None - last_n_tokens_data = [llama_cpp.llama_token(0)] * max( - 0, self.last_n_tokens_size - len(self._input_ids) - ) + self._input_ids[-self.last_n_tokens_size :].tolist() - return self._sample( - last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)( - *last_n_tokens_data - ), - last_n_tokens_size=self.last_n_tokens_size, - top_k=top_k, - top_p=top_p, - temp=temp, - tfs_z=tfs_z, - repeat_penalty=repeat_penalty, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - penalize_nl=penalize_nl, - logits_processor=logits_processor, - grammar=grammar, - ) - def generate( self, tokens: Sequence[int], @@ -805,8 +1158,7 @@ def generate( Yields: The generated tokens. """ - assert self.ctx is not None - if reset and len(self._input_ids) > 0: + if reset and self.n_tokens > 0: longest_prefix = 0 for a, b in zip(self._input_ids, tokens[:-1]): if a == b: @@ -862,8 +1214,8 @@ def create_embedding( Returns: An embedding object. 
""" - assert self.ctx is not None - assert self.model is not None + assert self._ctx.ctx is not None + assert self._model.model is not None model_name: str = model if model is not None else self.model_path if self.context_params.embedding == False: @@ -872,7 +1224,7 @@ def create_embedding( ) if self.verbose: - llama_cpp.llama_reset_timings(self.ctx) + llama_cpp.llama_reset_timings(self._ctx.ctx) if isinstance(input, str): inputs = [input] @@ -887,8 +1239,8 @@ def create_embedding( self.eval(tokens) n_tokens = len(tokens) total_tokens += n_tokens - embedding = llama_cpp.llama_get_embeddings(self.ctx)[ - : llama_cpp.llama_n_embd(self.model) + embedding = llama_cpp.llama_get_embeddings(self._ctx.ctx)[ + : llama_cpp.llama_n_embd(self._model.model) ] data.append( @@ -899,7 +1251,7 @@ def create_embedding( } ) if self.verbose: - llama_cpp.llama_print_timings(self.ctx) + llama_cpp.llama_print_timings(self._ctx.ctx) return { "object": "list", @@ -946,7 +1298,7 @@ def _create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: - assert self.ctx is not None + assert self._ctx is not None assert suffix is None or suffix.__class__ is str completion_id: str = f"cmpl-{str(uuid.uuid4())}" @@ -966,16 +1318,16 @@ def _create_completion( model_name: str = model if model is not None else self.model_path if self.verbose: - llama_cpp.llama_reset_timings(self.ctx) + self._ctx.reset_timings() - if len(prompt_tokens) >= llama_cpp.llama_n_ctx(self.ctx): + if len(prompt_tokens) >= self._n_ctx: raise ValueError( - f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" + f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self._ctx)}" ) if max_tokens <= 0: # Unlimited, depending on n_ctx. 
- max_tokens = llama_cpp.llama_n_ctx(self.ctx) - len(prompt_tokens) + max_tokens = self._n_ctx - len(prompt_tokens) # Truncate max_tokens if requested tokens would exceed the context window max_tokens = ( @@ -1186,7 +1538,7 @@ def _create_completion( finish_reason = "stop" if self.verbose: - llama_cpp.llama_print_timings(self.ctx) + self._ctx.print_timings() if stream: remaining_tokens = completion_tokens[returned_tokens:] @@ -1584,24 +1936,6 @@ def create_chat_completion( grammar=grammar, ) - def _free_model(self, *, _lbatch_free=llama_cpp._lib.llama_batch_free, _lfree_model=llama_cpp._lib.llama_free_model, _free=llama_cpp._lib.llama_free): - batch = getattr(self, 'batch', None) - if batch is not None: - _lbatch_free(batch) - self.batch = None - model = getattr(self, 'model', None) - if model is not None: - _lfree_model(model) - self.model = None - ctx = getattr(self, 'ctx', None) - if ctx is not None: - _free(ctx) - self.ctx = None - - def __del__(self): - with suppress_stdout_stderr(disable=self.verbose): - self._free_model() - def __getstate__(self): return dict( model_path=self.model_path, @@ -1686,16 +2020,16 @@ def __setstate__(self, state): ) def save_state(self) -> LlamaState: - assert self.ctx is not None + assert self._ctx.ctx is not None if self.verbose: print("Llama.save_state: saving llama state", file=sys.stderr) - state_size = llama_cpp.llama_get_state_size(self.ctx) + state_size = llama_cpp.llama_get_state_size(self._ctx.ctx) if self.verbose: print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr) llama_state = (llama_cpp.c_uint8 * int(state_size))() if self.verbose: print("Llama.save_state: allocated state", file=sys.stderr) - n_bytes = llama_cpp.llama_copy_state_data(self.ctx, llama_state) + n_bytes = llama_cpp.llama_copy_state_data(self._ctx.ctx, llama_state) if self.verbose: print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr) if int(n_bytes) > int(state_size): @@ -1716,7 +2050,7 @@ def save_state(self) -> LlamaState: ) def load_state(self, state: LlamaState) -> None: - assert self.ctx is not None + assert self._ctx.ctx is not None self.scores = state.scores.copy() self.input_ids = state.input_ids.copy() self.n_tokens = state.n_tokens @@ -1724,43 +2058,36 @@ def load_state(self, state: LlamaState) -> None: LLamaStateArrayType = llama_cpp.c_uint8 * state_size llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state) - if llama_cpp.llama_set_state_data(self.ctx, llama_state) != state_size: + if llama_cpp.llama_set_state_data(self._ctx.ctx, llama_state) != state_size: raise RuntimeError("Failed to set llama state data") def n_ctx(self) -> int: """Return the context window size.""" - assert self.ctx is not None - return llama_cpp.llama_n_ctx(self.ctx) + return self._ctx.n_ctx() def n_embd(self) -> int: """Return the embedding size.""" - assert self.model is not None - return llama_cpp.llama_n_embd(self.model) + return self._model.n_embd() def n_vocab(self) -> int: """Return the vocabulary size.""" - assert self.model is not None - return llama_cpp.llama_n_vocab(self.model) + return self._model.n_vocab() def tokenizer(self) -> "LlamaTokenizer": """Return the tokenizer for this model.""" - assert self.ctx is not None return LlamaTokenizer(self) def token_eos(self) -> int: """Return the end-of-sequence token.""" - assert self.model is not None - return llama_cpp.llama_token_eos(self.model) + return self._model.token_eos() def token_bos(self) -> int: """Return the beginning-of-sequence token.""" - assert self.model is not 
None - return llama_cpp.llama_token_bos(self.model) + return self._model.token_bos() def token_nl(self) -> int: """Return the newline token.""" - assert self.model is not None - return llama_cpp.llama_token_nl(self.model) + return self._model.token_nl() @staticmethod def logits_to_logprobs(logits: List[float]) -> List[float]: diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 8e683d113..a4d21004f 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -83,8 +83,6 @@ def _load_shared_library(lib_base_name: str): # Misc c_float_p = POINTER(c_float) -c_float_p_p = POINTER(POINTER(c_float)) -c_int_p = POINTER(c_int) c_uint8_p = POINTER(c_uint8) c_size_t_p = POINTER(c_size_t) @@ -115,11 +113,6 @@ def _load_shared_library(lib_base_name: str): # struct llama_context; llama_context_p = c_void_p -# struct clip_ctx; -clip_ctx_p = c_void_p - -# struct llava_image_embed; -llava_image_embed_p = c_void_p; # typedef int32_t llama_pos; llama_pos = c_int32 @@ -1085,7 +1078,7 @@ def llama_batch_get_one( tokens, # type: Array[llama_token] n_tokens: Union[c_int, int], pos_0: Union[llama_pos, int], - seq_id: Union[llama_seq_id, int], + seq_id: llama_seq_id, ) -> llama_batch: return _lib.llama_batch_get_one(tokens, n_tokens, pos_0, seq_id) @@ -1969,63 +1962,3 @@ def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p): _lib.llama_dump_timing_info_yaml.argtypes = [ctypes.c_void_p, llama_context_p] _lib.llama_dump_timing_info_yaml.restype = None - - -# LLAVA - - -# LLAMA_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity); -def clip_model_load(fname: Union[c_char_p, bytes], verbosity: c_int = 0) -> clip_ctx_p: - """ load mmproj model """ - return _lib.clip_model_load(fname, verbosity) -_lib.clip_model_load.argtypes = [c_char_p, c_int] -_lib.clip_model_load.restype = clip_ctx_p - - -# LLAMA_API void clip_free(struct clip_ctx * ctx); -def clip_free(ctx: clip_ctx_p): - """ free mmproj model """ - _lib.clip_free(ctx) -_lib.clip_free.argtypes = [clip_ctx_p] -_lib.clip_free.restype = None - - -#LLAMA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); -def llava_validate_embed_size(ctx_llama: llama_context_p, ctx_clip: clip_ctx_p) -> c_bool: - """ sanity check for clip <-> llava embed size match """ - return _lib.llava_validate_embed_size(ctx_llama, ctx_clip) -_lib.llava_validate_embed_size.argtypes = [llama_context_p, clip_ctx_p] -_lib.llava_validate_embed_size.restype = c_bool - - -#LLAMA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); -def llava_image_embed_make_with_bytes(ctx_clip: clip_ctx_p, n_threads: Union[int,c_int], image_bytes: c_uint8_p, image_bytes_length: c_size_t) -> llava_image_embed_p: - """ build an image embed by interpreting image_bytes as the contents of an image file with byte size image_bytes_length. 
- supported formats (autodetected): JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC (ref https://github.com/nothings/stb) """ - return _lib.llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length) -_lib.llava_image_embed_make_with_bytes.argtypes = [clip_ctx_p, c_int, c_uint8_p, c_size_t] -_lib.llava_image_embed_make_with_bytes.restype = llava_image_embed_p - - -#LLAMA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); -def llava_image_embed_make_with_filename(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], filename: Union[c_char_p, bytes]) -> llava_image_embed_p: - """ build an image embed from a path to an image filename """ - return _lib.llava_image_embed_make_with_filename(ctx_clip, n_threads, filename) -_lib.llava_image_embed_make_with_filename.argtypes = [clip_ctx_p, c_int, c_char_p] -_lib.llava_image_embed_make_with_filename.restype = llava_image_embed_p - -#LLAMA_API void llava_image_embed_free(struct llava_image_embed * embed); -def llava_image_embed_free(embed: llava_image_embed_p): - """ free an embedding made with one of the llava_image_embed_make_ methods """ - _lib.llava_image_embed_free(embed) -_lib.llava_image_embed_free.argtypes = [llava_image_embed_p] -_lib.llava_image_embed_free.restype = None - -#LLAMA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); -def llava_eval_image_embed(ctx: llama_context_p, image_embed: llava_image_embed_p, n_batch: c_int, n_past: c_int_p) -> c_bool: - """ write the image represented by embed into the llama context with batch size n_batch, - starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed.""" - return _lib.llava_eval_image_embed(ctx, image_embed, n_batch, n_past) -_lib.llava_eval_image_embed.argtypes = [llama_context_p, llava_image_embed_p, c_int, c_int_p] -_lib.llava_eval_image_embed.restyle = c_bool - From 82007d0b301f5432b8abcce0021d7f26de58cd4c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 14:01:58 -0500 Subject: [PATCH 08/26] Update llava example --- examples/multimodal/llava.py | 60 ++++++++++++++++++++++++------------ llama_cpp/llava_cpp.py | 2 +- 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/examples/multimodal/llava.py b/examples/multimodal/llava.py index a209625c1..ac5c33a73 100644 --- a/examples/multimodal/llava.py +++ b/examples/multimodal/llava.py @@ -1,69 +1,90 @@ import ctypes -import json import argparse import os import array import sys -from llama_cpp import (Llama, clip_model_load, llava_image_embed_make_with_filename, llava_image_embed_make_with_bytes, - llava_image_embed_p, llava_image_embed_free, llava_validate_embed_size, llava_eval_image_embed) +from llama_cpp import Llama +from llama_cpp.llava_cpp import ( + clip_model_load, + llava_image_embed_make_with_filename, + llava_image_embed_make_with_bytes, + llava_image_embed_free, + llava_validate_embed_size, + llava_eval_image_embed, +) parser = argparse.ArgumentParser() -parser.add_argument("-m", "--model", type=str, default="../models/llava-v1.5-7b/ggml-model-q5_k.gguf") +parser.add_argument( + "-m", "--model", type=str, default="../models/llava-v1.5-7b/ggml-model-q5_k.gguf" +) parser.add_argument("--mmproj", type=str, default="llava-v1.5-7b/mmproj-model-f16.gguf") parser.add_argument("-t", "--temp", type=float, default=0.1) -parser.add_argument("-p", "--prompt", 
type=str, default="Describe this image in detail.") +parser.add_argument( + "-p", "--prompt", type=str, default="Describe this image in detail." +) args = parser.parse_args() print(f"loading clip model from {args.mmproj}") if not os.path.exists(args.mmproj): raise FileNotFoundError(args.mmproj) -ctx_clip = clip_model_load(args.mmproj.encode('utf-8')) +ctx_clip = clip_model_load(fname=args.mmproj.encode("utf-8"), verbosity=0) image_path = os.path.join(os.path.dirname(__file__), "overfitting_lc.png") if not os.path.exists(image_path): raise FileNotFoundError(image_path) -image_embed = llava_image_embed_make_with_filename(ctx_clip=ctx_clip, n_threads=1, filename=image_path.encode('utf8')) +image_embed = llava_image_embed_make_with_filename( + ctx_clip=ctx_clip, n_threads=1, image_path=image_path.encode("utf8") +) -def load_image_embed_from_file_bytes(image_path: str) -> llava_image_embed_p: - with open(image_path, 'rb') as file: + +def load_image_embed_from_file_bytes(image_path: str): + with open(image_path, "rb") as file: image_bytes = file.read() bytes_length = len(image_bytes) - data_array = array.array('B', image_bytes) + data_array = array.array("B", image_bytes) c_ubyte_ptr = (ctypes.c_ubyte * len(data_array)).from_buffer(data_array) - return llava_image_embed_make_with_bytes(ctx_clip=ctx_clip, n_threads=1, image_bytes=c_ubyte_ptr, image_bytes_length=bytes_length) + return llava_image_embed_make_with_bytes( + ctx_clip=ctx_clip, + n_threads=1, + image_bytes=c_ubyte_ptr, + image_bytes_length=bytes_length, + ) + print(f"loading llm model from {args.model}") if not os.path.exists(args.model): raise FileNotFoundError(args.model) -llm = Llama(model_path=args.model, n_ctx=2048, n_gpu_layers=1) # longer context needed for image embeds +llm = Llama( + model_path=args.model, n_ctx=2048, n_gpu_layers=1 +) # longer context needed for image embeds if not llava_validate_embed_size(llm.ctx, ctx_clip): raise RuntimeError("llm and mmproj model embed size mismatch") # eval system prompt system_prompt = "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n" -llm.eval(llm.tokenize(system_prompt.encode('utf8'))) -llm.eval(llm.tokenize("\nUSER: ".encode('utf8'))) +llm.eval(llm.tokenize(system_prompt.encode("utf8"))) +llm.eval(llm.tokenize("\nUSER: ".encode("utf8"))) # eval image embed n_past = ctypes.c_int(llm.n_tokens) -n_past_p = ctypes.byref(n_past) +n_past_p = ctypes.pointer(n_past) llava_eval_image_embed(llm.ctx, image_embed, llm.n_batch, n_past_p) llm.n_tokens = n_past.value llava_image_embed_free(image_embed) # eval prompt -prompt = 'Describe the visual content of this image' -llm.eval(llm.tokenize(prompt.encode('utf8'))) -llm.eval(llm.tokenize("\nASSISTANT:".encode('utf8'))) +prompt = "Describe the visual content of this image" +llm.eval(llm.tokenize(prompt.encode("utf8"))) +llm.eval(llm.tokenize("\nASSISTANT:".encode("utf8"))) # get output print("\n") max_target_len = 256 for i in range(max_target_len): t_id = llm.sample(temp=0.1) - t = llm.detokenize([t_id]).decode('utf8') + t = llm.detokenize([t_id]).decode("utf8") if t == "": break print(t, end="") @@ -72,4 +93,3 @@ def load_image_embed_from_file_bytes(image_path: str) -> llava_image_embed_p: print("\n") print("done") - diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index 5dc4b4201..72f6a1211 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -134,7 +134,7 @@ def llava_image_embed_free(embed: "_Pointer[llava_image_embed]"): # /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */ # LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); -def llava_eval_image_embed(ctx_llama: llama_cpp.llama_context_p, embed: "_Pointer[llava_image_embed]", n_batch: Union[c_int, int], n_past: Union[c_int, int]) -> bool: +def llava_eval_image_embed(ctx_llama: llama_cpp.llama_context_p, embed: "_Pointer[llava_image_embed]", n_batch: Union[c_int, int], n_past: "_Pointer[c_int]") -> bool: return _libllava.llava_eval_image_embed(ctx_llama, embed, n_batch, n_past) _libllava.llava_eval_image_embed.argtypes = [llama_cpp.llama_context_p, POINTER(llava_image_embed), c_int, POINTER(c_int)] From f6fe6b001cb96beb149ba74e0d8e94422313f374 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 14:46:45 -0500 Subject: [PATCH 09/26] Add types for new gpt-4-vision-preview api --- llama_cpp/llama_types.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index a64033ea0..5d48cede3 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -77,9 +77,25 @@ class ChatCompletionFunctionCall(TypedDict): arguments: str +class _ChatCompletionTextContent(TypedDict): + type: Literal["text"] + text: str + + +class _ChatCompletionImageUrlContentUrl(TypedDict): + url: str + + +class _ChatCompletionImageUrlContent(TypedDict): + type: Literal["image_url"] + image_url: _ChatCompletionImageUrlContentUrl + + class ChatCompletionResponseMessage(TypedDict): role: Literal["assistant", "user", "system", "function"] - content: Optional[str] + content: Optional[ + Union[str, _ChatCompletionTextContent, _ChatCompletionImageUrlContent] + ] user: NotRequired[str] function_call: NotRequired[ChatCompletionFunctionCall] From 39e2be13c3086cc202816bd8e093568991d004cc Mon Sep 17 00:00:00 
2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 14:47:54 -0500 Subject: [PATCH 10/26] Fix typo --- llama_cpp/llama_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 5d48cede3..9d2848552 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -1,4 +1,4 @@ -"""Types and request signatrues for OpenAI compatibility +"""Types and request signatures for OpenAI compatibility Based on the OpenAI OpenAPI specification: https://github.com/openai/openai-openapi/blob/master/openapi.yaml From 7c3009ed5fe723cfc1d7cbc79f604728342d262a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 6 Nov 2023 19:05:55 -0500 Subject: [PATCH 11/26] Update llama.cpp --- .gitmodules | 2 +- vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 6fe937b38..7edf0975d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp - url = https://github.com/damian0815/llama.cpp.git + url = https://github.com/ggerganov/llama.cpp.git diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 22f43fca0..381efbf48 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 22f43fca0ac2237766f825a8ab4aa2d5e19238d0 +Subproject commit 381efbf480959bb6d1e247a8b0c2328f22e350f8 From 1f1abfdea8d4061a55da84fe667d59e510a8b45f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 00:09:25 -0500 Subject: [PATCH 12/26] Update llama_types to match OpenAI v1 API --- llama_cpp/llama.py | 10 +-- llama_cpp/llama_chat_format.py | 46 +++++++++-- llama_cpp/llama_types.py | 138 +++++++++++++++++++++++---------- llama_cpp/server/app.py | 4 +- 4 files changed, 145 insertions(+), 53 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6dc113ac9..b4242ea04 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1231,7 +1231,7 @@ def create_embedding( else: inputs = input - data: List[EmbeddingData] = [] + data: List[Embedding] = [] total_tokens = 0 for index, input in enumerate(inputs): tokens = self.tokenize(input.encode("utf-8"), special=True) @@ -1297,7 +1297,7 @@ def _create_completion( stopping_criteria: Optional[StoppingCriteriaList] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, - ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: + ) -> Union[Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]]: assert self._ctx is not None assert suffix is None or suffix.__class__ is str @@ -1753,7 +1753,7 @@ def create_completion( stopping_criteria: Optional[StoppingCriteriaList] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, - ) -> Union[Completion, Iterator[CompletionChunk]]: + ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. 
Args: @@ -1800,7 +1800,7 @@ def create_completion( grammar=grammar, ) if stream: - chunks: Iterator[CompletionChunk] = completion_or_chunks + chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks return chunks completion: Completion = next(completion_or_chunks) # type: ignore return completion @@ -1828,7 +1828,7 @@ def __call__( stopping_criteria: Optional[StoppingCriteriaList] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, - ) -> Union[Completion, Iterator[CompletionChunk]]: + ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. Args: diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 903a8c908..30c505a53 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -199,7 +199,7 @@ def _convert_text_completion_to_chat( def _convert_text_completion_chunks_to_chat( - chunks: Iterator[llama_types.CompletionChunk], + chunks: Iterator[llama_types.CreateCompletionStreamResponse], ) -> Iterator[llama_types.ChatCompletionChunk]: for i, chunk in enumerate(chunks): if i == 0: @@ -239,12 +239,12 @@ def _convert_text_completion_chunks_to_chat( def _convert_completion_to_chat( completion_or_chunks: Union[ - llama_types.Completion, Iterator[llama_types.CompletionChunk] + llama_types.CreateCompletionResponse, Iterator[llama_types.CreateCompletionStreamResponse] ], stream: bool = False, -) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: +) -> Union[llama_types.CreateChatCompletionResponse, Iterator[llama_types.ChatCompletionChunk]]: if stream: - chunks: Iterator[llama_types.CompletionChunk] = completion_or_chunks # type: ignore + chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore return _convert_text_completion_chunks_to_chat(chunks) else: completion: llama_types.Completion = completion_or_chunks # type: ignore @@ -613,13 +613,13 @@ def prepare_messages_for_inference( all_messages: List[llama_types.ChatCompletionRequestMessage] = [] if functions is not None: all_messages.append( - llama_types.ChatCompletionRequestMessage( + llama_types.ChatCompletionRequestSystemMessage( role="system", content=generate_schema_from_functions(functions) ) ) all_messages.append( - llama_types.ChatCompletionRequestMessage( + llama_types.ChatCompletionRequestSystemMessage( role="system", content=SYSTEM_MESSAGE ) ) @@ -636,7 +636,7 @@ def prepare_messages_for_inference( all_messages.append(message) all_messages.append( - llama_types.ChatCompletionRequestMessage(role="assistant", content=None) + llama_types.ChatCompletionRequestAssistantMessage(role="assistant", content=None) ) def message_to_str(msg: llama_types.ChatCompletionRequestMessage): @@ -734,3 +734,35 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): ], usage=completion["usage"], ) + + +@register_chat_completion_handler("llava-1.5") +def lava_1_5_chat_handler( + llama: llama.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[Union[str, llama_types.ChatCompletionFunctionCall]] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + max_tokens: int = 256, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + 
mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[llama.LogitsProcessorList] = None, + grammar: Optional[llama.LlamaGrammar] = None, +) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: + # convert messages into a list of strings and images objects + # for each item in list + # if string, process it and append to prompt + # if image, evaluate it and add empty string to prompt (for now) + # generate completion + items = [] + current_prompt = "" \ No newline at end of file diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 9d2848552..43c4f081e 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -1,5 +1,7 @@ """Types and request signatures for OpenAI compatibility +NOTE: These types may change to match the OpenAI OpenAPI specification. + Based on the OpenAI OpenAPI specification: https://github.com/openai/openai-openapi/blob/master/openapi.yaml @@ -19,9 +21,6 @@ class Embedding(TypedDict): embedding: List[float] -EmbeddingData = Embedding - - class CreateEmbeddingResponse(TypedDict): object: Literal["list"] model: str @@ -57,9 +56,6 @@ class CreateCompletionStreamResponse(TypedDict): choices: List[CompletionChoice] -CompletionChunk = CreateCompletionStreamResponse - - class CreateCompletionResponse(TypedDict): id: str object: Literal["text_completion"] @@ -69,9 +65,6 @@ class CreateCompletionResponse(TypedDict): usage: CompletionUsage -Completion = CreateCompletionResponse - - class ChatCompletionFunctionCall(TypedDict): name: str arguments: str @@ -100,73 +93,58 @@ class ChatCompletionResponseMessage(TypedDict): function_call: NotRequired[ChatCompletionFunctionCall] -ChatCompletionMessage = ChatCompletionResponseMessage - - class ChatCompletionResponseFunction(TypedDict): name: str description: NotRequired[str] parameters: Dict[str, Any] # TODO: make this more specific -ChatCompletionFunction = ChatCompletionResponseFunction - - class ChatCompletionResponseChoice(TypedDict): index: int - message: ChatCompletionMessage + message: "ChatCompletionMessage" finish_reason: Optional[str] -ChatCompletionChoice = ChatCompletionResponseChoice - - class CreateChatCompletionResponse(TypedDict): id: str object: Literal["chat.completion"] created: int model: str - choices: List[ChatCompletionChoice] + choices: List["ChatCompletionChoice"] usage: CompletionUsage -ChatCompletion = CreateChatCompletionResponse +class ChatCompletionMessageToolCallChunk(TypedDict): + index: int + id: NotRequired[str] + type: Literal["function"] + function: ChatCompletionFunctionCall class ChatCompletionStreamResponseDeltaEmpty(TypedDict): pass -ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty - - class ChatCompletionStreamResponseDelta(TypedDict): - role: NotRequired[Literal["assistant"]] content: NotRequired[str] function_call: NotRequired[ChatCompletionFunctionCall] - - -ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta + tool_calls: NotRequired[List[ChatCompletionMessageToolCallChunk]] + role: NotRequired[Literal["system", "user", "assistant", "tool"]] class ChatCompletionStreamResponseChoice(TypedDict): index: int - delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty] + delta: Union["ChatCompletionChunkDelta", "ChatCompletionChunkDeltaEmpty"] finish_reason: Optional[Literal["stop", "length", "function_call"]] -ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice - - class 
ChatCompletionStreamResponse(TypedDict): id: str model: str object: Literal["chat.completion.chunk"] created: int - choices: List[ChatCompletionChunkChoice] - + choices: List["ChatCompletionChunkChoice"] -ChatCompletionChunk = ChatCompletionStreamResponse JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]] @@ -181,8 +159,90 @@ class ChatCompletionFunctionCallOption(TypedDict): name: str -class ChatCompletionRequestMessage(TypedDict): - role: Literal["assistant", "user", "system", "function"] +class ChatCompletionRequestMessageContentPartText(TypedDict): + type: Literal["text"] + text: str + + +class ChatCompletionRequestMessageContentPartImageImageUrl(TypedDict): + url: str + detail: NotRequired[Literal["auto", "low", "high"]] + + +class ChatCompletionRequestMessageContentPartImage(TypedDict): + type: Literal["image_url"] + image_url: ChatCompletionRequestMessageContentPartImageImageUrl + + +ChatCompletionRequestMessageContentPart = Union[ + ChatCompletionRequestMessageContentPartText, + ChatCompletionRequestMessageContentPartImage, +] + + +class ChatCompletionRequestSystemMessage(TypedDict): + role: Literal["system"] content: Optional[str] - name: NotRequired[str] - function_call: NotRequired[ChatCompletionFunctionCall] + + +class ChatCompletionRequestUserMessage(TypedDict): + role: Literal["user"] + content: Optional[Union[str, List[ChatCompletionRequestMessageContentPart]]] + + +class ChatCompletionMessageToolCallFunction(TypedDict): + name: str + arguments: str + + +class ChatCompletionMessageToolCall(TypedDict): + id: str + type: Literal["function"] + function: ChatCompletionMessageToolCallFunction + + +ChatCompletionMessageToolCalls = List[ChatCompletionMessageToolCall] + + +class ChatCompletionRequestAssistantMessage(TypedDict): + role: Literal["assistant"] + content: Optional[str] + tool_calls: NotRequired[ChatCompletionMessageToolCalls] + function_call: NotRequired[ChatCompletionFunctionCall] # DEPRECATED + + +class ChatCompletionRequestToolMessage(TypedDict): + role: Literal["tool"] + content: Optional[str] + tool_call_id: str + + +class ChatCompletionRequestFunctionMessage(TypedDict): + role: Literal["function"] + content: Optional[str] + name: str + + +ChatCompletionRequestMessage = Union[ + ChatCompletionRequestSystemMessage, + ChatCompletionRequestUserMessage, + ChatCompletionRequestAssistantMessage, + ChatCompletionRequestUserMessage, + ChatCompletionRequestToolMessage, + ChatCompletionRequestFunctionMessage, +] + +# NOTE: The following type names are not part of the OpenAI OpenAPI specification +# and will be removed in a future major release. 
+ +EmbeddingData = Embedding +CompletionChunk = CreateCompletionStreamResponse +Completion = CreateCompletionResponse +ChatCompletionMessage = ChatCompletionResponseMessage +ChatCompletionChoice = ChatCompletionResponseChoice +ChatCompletion = CreateChatCompletionResponse +ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty +ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice +ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta +ChatCompletionChunk = ChatCompletionStreamResponse +ChatCompletionFunction = ChatCompletionResponseFunction diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 93afc3ee9..afd6a055b 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -688,7 +688,7 @@ async def create_completion( kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) iterator_or_completion: Union[ - llama_cpp.Completion, Iterator[llama_cpp.CompletionChunk] + llama_cpp.CreateCompletionResponse, Iterator[llama_cpp.CreateCompletionStreamResponse] ] = await run_in_threadpool(llama, **kwargs) if isinstance(iterator_or_completion, Iterator): @@ -697,7 +697,7 @@ async def create_completion( # If no exception was raised from first_response, we can assume that # the iterator is valid and we can use it to stream the response. - def iterator() -> Iterator[llama_cpp.CompletionChunk]: + def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: yield first_response yield from iterator_or_completion From 2a369f411fd083396a6917d21f7bfd19d56901b0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 00:59:29 -0500 Subject: [PATCH 13/26] Update ChatCompletionFunction type --- llama_cpp/llama_types.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 43c4f081e..bff77a180 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -93,7 +93,7 @@ class ChatCompletionResponseMessage(TypedDict): function_call: NotRequired[ChatCompletionFunctionCall] -class ChatCompletionResponseFunction(TypedDict): +class ChatCompletionFunction(TypedDict): name: str description: NotRequired[str] parameters: Dict[str, Any] # TODO: make this more specific @@ -245,4 +245,4 @@ class ChatCompletionRequestFunctionMessage(TypedDict): ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta ChatCompletionChunk = ChatCompletionStreamResponse -ChatCompletionFunction = ChatCompletionResponseFunction +ChatCompletionResponseFunction = ChatCompletionFunction From 2ea2adfa5f2d545e872b622f463f152f67038de5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 00:59:42 -0500 Subject: [PATCH 14/26] Reorder request parameters --- llama_cpp/server/app.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index afd6a055b..dd6169931 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -580,10 +580,6 @@ class CreateCompletionRequest(BaseModel): max_tokens: int = max_tokens_field temperature: float = temperature_field top_p: float = top_p_field - mirostat_mode: int = mirostat_mode_field - mirostat_tau: float = mirostat_tau_field - mirostat_eta: float = mirostat_eta_field - grammar: Optional[str] = None echo: bool = Field( default=False, description="Whether to echo the prompt in the generated text. 
Useful for chatbots.", @@ -610,6 +606,10 @@ class CreateCompletionRequest(BaseModel): top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + mirostat_mode: int = mirostat_mode_field + mirostat_tau: float = mirostat_tau_field + mirostat_eta: float = mirostat_eta_field + grammar: Optional[str] = None model_config = { "json_schema_extra": { @@ -765,10 +765,6 @@ class CreateChatCompletionRequest(BaseModel): max_tokens: int = max_tokens_field temperature: float = temperature_field top_p: float = top_p_field - mirostat_mode: int = mirostat_mode_field - mirostat_tau: float = mirostat_tau_field - mirostat_eta: float = mirostat_eta_field - grammar: Optional[str] = None stop: Optional[List[str]] = stop_field stream: bool = stream_field presence_penalty: Optional[float] = presence_penalty_field @@ -784,6 +780,10 @@ class CreateChatCompletionRequest(BaseModel): top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + mirostat_mode: int = mirostat_mode_field + mirostat_tau: float = mirostat_tau_field + mirostat_eta: float = mirostat_eta_field + grammar: Optional[str] = None model_config = { "json_schema_extra": { From 87fc84bb965f59aea0bb76a56c93b3a5272e423d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 08:17:14 -0500 Subject: [PATCH 15/26] More API type fixes --- llama_cpp/llama.py | 6 +- llama_cpp/llama_chat_format.py | 34 ++++++++--- llama_cpp/llama_types.py | 108 ++++++++++++++++++++++----------- llama_cpp/server/app.py | 13 +++- 4 files changed, 110 insertions(+), 51 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index b4242ea04..f09b8d58a 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1879,7 +1879,9 @@ def create_chat_completion( self, messages: List[ChatCompletionRequestMessage], functions: Optional[List[ChatCompletionFunction]] = None, - function_call: Optional[Union[str, ChatCompletionFunctionCall]] = None, + function_call: Optional[ChatCompletionRequestFunctionCall] = None, + tools: List[ChatCompletionTool] = [], + tool_choice: Optional[ChatCompletionToolChoiceOption] = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, @@ -1918,6 +1920,8 @@ def create_chat_completion( messages=messages, functions=functions, function_call=function_call, + tools=tools, + tool_choice=tool_choice, temperature=temperature, top_p=top_p, top_k=top_k, diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 30c505a53..103aa5ffd 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -11,12 +11,13 @@ class LlamaChatCompletionHandler(Protocol): def __call__( self, + *, llama: llama.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[ - Union[str, llama_types.ChatCompletionFunctionCall] - ] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, @@ -33,6 +34,7 @@ def __call__( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, + **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, 
Iterator[llama_types.ChatCompletionChunk]]: ... @@ -239,10 +241,13 @@ def _convert_text_completion_chunks_to_chat( def _convert_completion_to_chat( completion_or_chunks: Union[ - llama_types.CreateCompletionResponse, Iterator[llama_types.CreateCompletionStreamResponse] + llama_types.CreateCompletionResponse, + Iterator[llama_types.CreateCompletionStreamResponse], ], stream: bool = False, -) -> Union[llama_types.CreateChatCompletionResponse, Iterator[llama_types.ChatCompletionChunk]]: +) -> Union[ + llama_types.CreateChatCompletionResponse, Iterator[llama_types.ChatCompletionChunk] +]: if stream: chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore return _convert_text_completion_chunks_to_chat(chunks) @@ -329,7 +334,9 @@ def get_chat_format(name: str): ) -def hf_autotokenizer_to_chat_formatter(pretrained_model_name_or_path: Union[str, os.PathLike[str]]) -> ChatFormatter: +def hf_autotokenizer_to_chat_formatter( + pretrained_model_name_or_path: Union[str, os.PathLike[str]] +) -> ChatFormatter: # https://huggingface.co/docs/transformers/main/chat_templating # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/blob/main/tokenizer_config.json @@ -538,7 +545,7 @@ def functionary_chat_handler( llama: llama.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[Union[str, llama_types.ChatCompletionFunctionCall]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, @@ -555,6 +562,7 @@ def functionary_chat_handler( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, + **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" @@ -636,7 +644,9 @@ def prepare_messages_for_inference( all_messages.append(message) all_messages.append( - llama_types.ChatCompletionRequestAssistantMessage(role="assistant", content=None) + llama_types.ChatCompletionRequestAssistantMessage( + role="assistant", content=None + ) ) def message_to_str(msg: llama_types.ChatCompletionRequestMessage): @@ -713,6 +723,9 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): prompt=new_prompt, stop=["user:", ""], stream=False ) # type: ignore + assert "usage" in completion + assert isinstance(function_call, str) + return llama_types.CreateChatCompletionResponse( id="chat" + completion["id"], object="chat.completion", @@ -741,7 +754,7 @@ def lava_1_5_chat_handler( llama: llama.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[Union[str, llama_types.ChatCompletionFunctionCall]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, @@ -758,6 +771,7 @@ def lava_1_5_chat_handler( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, + **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: # convert messages into a list of strings and images objects # for each item in list @@ -765,4 +779,4 @@ def lava_1_5_chat_handler( # if image, evaluate it and add empty string to prompt (for now) # generate completion items = [] - current_prompt = "" \ No newline at end of file + current_prompt = "" diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index bff77a180..cc4c1518f 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -48,49 +48,25 @@ class CompletionUsage(TypedDict): total_tokens: int -class CreateCompletionStreamResponse(TypedDict): - id: str - object: Literal["text_completion"] - created: int - model: str - choices: List[CompletionChoice] - - class CreateCompletionResponse(TypedDict): id: str object: Literal["text_completion"] created: int model: str choices: List[CompletionChoice] - usage: CompletionUsage + usage: NotRequired[CompletionUsage] -class ChatCompletionFunctionCall(TypedDict): +class ChatCompletionResponseFunctionCall(TypedDict): name: str arguments: str -class _ChatCompletionTextContent(TypedDict): - type: Literal["text"] - text: str - - -class _ChatCompletionImageUrlContentUrl(TypedDict): - url: str - - -class _ChatCompletionImageUrlContent(TypedDict): - type: Literal["image_url"] - image_url: _ChatCompletionImageUrlContentUrl - - class ChatCompletionResponseMessage(TypedDict): - role: Literal["assistant", "user", "system", "function"] - content: Optional[ - Union[str, _ChatCompletionTextContent, _ChatCompletionImageUrlContent] - ] - user: NotRequired[str] - function_call: NotRequired[ChatCompletionFunctionCall] + content: Optional[str] + tool_calls: NotRequired["ChatCompletionMessageToolCalls"] + role: Literal["assistant", "function"] # NOTE: "function" may be incorrect here + function_call: NotRequired[ChatCompletionResponseFunctionCall] # DEPRECATED class ChatCompletionFunction(TypedDict): @@ -101,7 +77,7 @@ class ChatCompletionFunction(TypedDict): class ChatCompletionResponseChoice(TypedDict): index: int - message: "ChatCompletionMessage" + message: "ChatCompletionResponseMessage" 
finish_reason: Optional[str] @@ -110,24 +86,36 @@ class CreateChatCompletionResponse(TypedDict): object: Literal["chat.completion"] created: int model: str - choices: List["ChatCompletionChoice"] + choices: List["ChatCompletionResponseChoice"] usage: CompletionUsage +class ChatCompletionMessageToolCallChunkFunction(TypedDict): + name: str + arguments: str + + class ChatCompletionMessageToolCallChunk(TypedDict): index: int id: NotRequired[str] type: Literal["function"] - function: ChatCompletionFunctionCall + function: ChatCompletionMessageToolCallChunkFunction class ChatCompletionStreamResponseDeltaEmpty(TypedDict): pass +class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict): + name: str + arguments: str + + class ChatCompletionStreamResponseDelta(TypedDict): content: NotRequired[str] - function_call: NotRequired[ChatCompletionFunctionCall] + function_call: NotRequired[ + ChatCompletionStreamResponseDeltaFunctionCall + ] # DEPRECATED tool_calls: NotRequired[List[ChatCompletionMessageToolCallChunk]] role: NotRequired[Literal["system", "user", "assistant", "tool"]] @@ -171,7 +159,7 @@ class ChatCompletionRequestMessageContentPartImageImageUrl(TypedDict): class ChatCompletionRequestMessageContentPartImage(TypedDict): type: Literal["image_url"] - image_url: ChatCompletionRequestMessageContentPartImageImageUrl + image_url: Union[str, ChatCompletionRequestMessageContentPartImageImageUrl] ChatCompletionRequestMessageContentPart = Union[ @@ -204,11 +192,18 @@ class ChatCompletionMessageToolCall(TypedDict): ChatCompletionMessageToolCalls = List[ChatCompletionMessageToolCall] +class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict): + name: str + arguments: str + + class ChatCompletionRequestAssistantMessage(TypedDict): role: Literal["assistant"] content: Optional[str] tool_calls: NotRequired[ChatCompletionMessageToolCalls] - function_call: NotRequired[ChatCompletionFunctionCall] # DEPRECATED + function_call: NotRequired[ + ChatCompletionRequestAssistantMessageFunctionCall + ] # DEPRECATED class ChatCompletionRequestToolMessage(TypedDict): @@ -232,12 +227,50 @@ class ChatCompletionRequestFunctionMessage(TypedDict): ChatCompletionRequestFunctionMessage, ] + +class ChatCompletionRequestFunctionCallOption: + name: str + + +ChatCompletionRequestFunctionCall = Union[ + Literal["none", "auto"], ChatCompletionRequestFunctionCallOption +] + +ChatCompletionFunctionParameters = Dict[str, JsonType] + + +class ChatCompletionToolFunction(TypedDict): + name: str + description: NotRequired[str] + parameters: ChatCompletionFunctionParameters + + +class ChatCompletionTool(TypedDict): + type: Literal["function"] + function: ChatCompletionToolFunction + + +class ChatCompletionNamedToolChoiceFunction(TypedDict): + name: str + + +class ChatCompletionNamedToolChoice(TypedDict): + type: Literal["function"] + function: ChatCompletionNamedToolChoiceFunction + + +ChatCompletionToolChoiceOption = Union[ + Literal["none", "auto"], ChatCompletionNamedToolChoice +] + + # NOTE: The following type names are not part of the OpenAI OpenAPI specification # and will be removed in a future major release. 
EmbeddingData = Embedding -CompletionChunk = CreateCompletionStreamResponse +CompletionChunk = CreateCompletionResponse Completion = CreateCompletionResponse +CreateCompletionStreamResponse = CreateCompletionResponse ChatCompletionMessage = ChatCompletionResponseMessage ChatCompletionChoice = ChatCompletionResponseChoice ChatCompletion = CreateChatCompletionResponse @@ -246,3 +279,4 @@ class ChatCompletionRequestFunctionMessage(TypedDict): ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta ChatCompletionChunk = ChatCompletionStreamResponse ChatCompletionResponseFunction = ChatCompletionFunction +ChatCompletionFunctionCall = ChatCompletionResponseFunctionCall diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index dd6169931..261c58445 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -748,20 +748,27 @@ class ChatCompletionRequestMessage(BaseModel): ) content: Optional[str] = Field(default="", description="The content of the message.") -from typing import Any class CreateChatCompletionRequest(BaseModel): - messages: List[Any] = Field( + messages: List[llama_cpp.ChatCompletionRequestMessage] = Field( default=[], description="A list of messages to generate completions for." ) functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field( default=None, description="A list of functions to apply to the generated completions.", ) - function_call: Optional[Union[Literal["auto", "none"], llama_cpp.ChatCompletionFunctionCallOption]] = Field( + function_call: Optional[llama_cpp.ChatCompletionRequestFunctionCall] = Field( default=None, description="A function to apply to the generated completions.", ) + tools: Optional[List[llama_cpp.ChatCompletionTool]] = Field( + default=None, + description="A list of tools to apply to the generated completions.", + ) + tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field( + default=None, + description="A tool to apply to the generated completions.", + ) # TODO: verify max_tokens: int = max_tokens_field temperature: float = temperature_field top_p: float = top_p_field From 5091b9c5645547f69e652c9091556e6ec5b0d064 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 10:14:45 -0500 Subject: [PATCH 16/26] Even More Type Updates --- llama_cpp/llama.py | 12 ++++-- llama_cpp/llama_chat_format.py | 75 +++++++++++++++++++--------------- llama_cpp/llama_types.py | 13 +++--- 3 files changed, 59 insertions(+), 41 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index f09b8d58a..ebb90046e 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1297,7 +1297,9 @@ def _create_completion( stopping_criteria: Optional[StoppingCriteriaList] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, - ) -> Union[Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]]: + ) -> Union[ + Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] + ]: assert self._ctx is not None assert suffix is None or suffix.__class__ is str @@ -1880,7 +1882,7 @@ def create_chat_completion( messages: List[ChatCompletionRequestMessage], functions: Optional[List[ChatCompletionFunction]] = None, function_call: Optional[ChatCompletionRequestFunctionCall] = None, - tools: List[ChatCompletionTool] = [], + tools: Optional[List[ChatCompletionTool]] = None, tool_choice: Optional[ChatCompletionToolChoiceOption] = None, temperature: float = 0.2, top_p: float = 0.95, @@ -1898,7 +1900,9 @@ def create_chat_completion( model: 
Optional[str] = None, logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, - ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + ) -> Union[ + CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] + ]: """Generate a chat completion from a list of messages. Args: @@ -1916,7 +1920,7 @@ def create_chat_completion( """ handler = llama_chat_format.get_chat_completion_handler(self.chat_format) return handler( - self, + llama=self, messages=messages, functions=functions, function_call=function_call, diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 103aa5ffd..a10bc70cc 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -35,7 +35,7 @@ def __call__( logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, **kwargs, # type: ignore - ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: + ) -> Union[llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse]]: ... @@ -749,34 +749,45 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): ) -@register_chat_completion_handler("llava-1.5") -def lava_1_5_chat_handler( - llama: llama.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - temperature: float = 0.2, - top_p: float = 0.95, - top_k: int = 40, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - max_tokens: int = 256, - presence_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - logits_processor: Optional[llama.LogitsProcessorList] = None, - grammar: Optional[llama.LlamaGrammar] = None, - **kwargs, # type: ignore -) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: - # convert messages into a list of strings and images objects - # for each item in list - # if string, process it and append to prompt - # if image, evaluate it and add empty string to prompt (for now) - # generate completion - items = [] - current_prompt = "" +class Llava15ChatHandler: + def __init__(self, clip_model_path: str): + self.clip_model_path = clip_model_path + + def chat_handler( + self, + llama: llama.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + max_tokens: int = 256, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[llama.LogitsProcessorList] = None, + grammar: Optional[llama.LlamaGrammar] = None, + **kwargs, # type: ignore + ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: + # convert messages into a list of strings and images objects + # for each item in list + # if string, process it and append to prompt + # if 
image, evaluate it and add empty string to prompt (for now) + # generate completion + items = [] + current_prompt = "" + system_prompt = "" + for message in messages: + if message["role"] == "system" and message["content"] is not None: + system_prompt = message["content"] + if message["role"] == "user": + items.append(message["content"]) + current_prompt += message["content"] \ No newline at end of file diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index cc4c1518f..b49d8594c 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -65,7 +65,7 @@ class ChatCompletionResponseFunctionCall(TypedDict): class ChatCompletionResponseMessage(TypedDict): content: Optional[str] tool_calls: NotRequired["ChatCompletionMessageToolCalls"] - role: Literal["assistant", "function"] # NOTE: "function" may be incorrect here + role: Literal["assistant", "function"] # NOTE: "function" may be incorrect here function_call: NotRequired[ChatCompletionResponseFunctionCall] # DEPRECATED @@ -122,16 +122,18 @@ class ChatCompletionStreamResponseDelta(TypedDict): class ChatCompletionStreamResponseChoice(TypedDict): index: int - delta: Union["ChatCompletionChunkDelta", "ChatCompletionChunkDeltaEmpty"] + delta: Union[ + ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty + ] finish_reason: Optional[Literal["stop", "length", "function_call"]] -class ChatCompletionStreamResponse(TypedDict): +class CreateChatCompletionStreamResponse(TypedDict): id: str model: str object: Literal["chat.completion.chunk"] created: int - choices: List["ChatCompletionChunkChoice"] + choices: List[ChatCompletionStreamResponseChoice] JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]] @@ -277,6 +279,7 @@ class ChatCompletionNamedToolChoice(TypedDict): ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta -ChatCompletionChunk = ChatCompletionStreamResponse +ChatCompletionChunk = CreateChatCompletionStreamResponse +ChatCompletionStreamResponse = CreateChatCompletionStreamResponse ChatCompletionResponseFunction = ChatCompletionFunction ChatCompletionFunctionCall = ChatCompletionResponseFunctionCall From 22a776d5936f7e62c62a9637a128acaf90ef2a32 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 10:15:08 -0500 Subject: [PATCH 17/26] Add parameter for custom chat_handler to Llama class --- llama_cpp/llama.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ebb90046e..9cf50fc27 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -752,6 +752,7 @@ def __init__( numa: bool = False, # Chat Format Params chat_format: str = "llama-2", + chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None, # Misc verbose: bool = True, # Extra Params @@ -784,6 +785,7 @@ def __init__( lora_path: Path to a LoRA file to apply to the model. numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init) chat_format: String specifying the chat format to use when calling create_chat_completion. + chat_handler: Optional chat handler to use when calling create_chat_completion. verbose: Print verbose output to stderr. 
Raises: @@ -910,6 +912,7 @@ def __init__( print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) self.chat_format = chat_format + self.chat_handler = chat_handler self._n_vocab = self.n_vocab() self._n_ctx = self.n_ctx() @@ -1918,7 +1921,9 @@ def create_chat_completion( Returns: Generated chat completion or a stream of chat completion chunks. """ - handler = llama_chat_format.get_chat_completion_handler(self.chat_format) + handler = self.chat_handler or llama_chat_format.get_chat_completion_handler( + self.chat_format + ) return handler( llama=self, messages=messages, @@ -1982,6 +1987,7 @@ def __getstate__(self): numa=self.numa, # Chat Format Params chat_format=self.chat_format, + chat_handler=self.chat_handler, # Misc verbose=self.verbose, ) @@ -2023,6 +2029,7 @@ def __setstate__(self, state): numa=state["numa"], # Chat Format Params chat_format=state["chat_format"], + chat_handler=state["chat_handler"], # Misc verbose=state["verbose"], ) From 5ac81151665532471392b71eea959a23d17e2863 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 15:13:44 -0500 Subject: [PATCH 18/26] Fix circular import --- llama_cpp/llama_chat_format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a10bc70cc..c029312da 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4,8 +4,8 @@ import dataclasses from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, Protocol -from . import llama_types -from . import llama +import llama_cpp.llama_types as llama_types +import llama_cpp.llama as llama class LlamaChatCompletionHandler(Protocol): From cb749f2449e85120ba2c3603734a1517a2e20cc6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 15:28:35 -0500 Subject: [PATCH 19/26] Convert to absolute imports --- llama_cpp/llama.py | 2 +- llama_cpp/llama_grammar.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 9cf50fc27..fd219e28c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -21,9 +21,9 @@ import diskcache import ctypes -from . import llama_cpp from .llama_types import * from .llama_grammar import LlamaGrammar +import llama_cpp.llama_cpp as llama_cpp import llama_cpp.llama_chat_format as llama_chat_format import numpy as np diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index 29431d957..ccbea574b 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -19,7 +19,7 @@ overload, ) -from . 
import llama_cpp +import llama_cpp.llama_cpp as llama_cpp # Type aliases llama_grammar_element = llama_cpp.llama_grammar_element From d2d2a2d470a12ad278966ad63fcc2ddbe038d5c1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 15:29:03 -0500 Subject: [PATCH 20/26] Fix --- llama_cpp/llama_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index b49d8594c..3683dd844 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -230,7 +230,7 @@ class ChatCompletionRequestFunctionMessage(TypedDict): ] -class ChatCompletionRequestFunctionCallOption: +class ChatCompletionRequestFunctionCallOption(TypedDict): name: str From 177114c368fd9dcc867135974af189654026a135 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 22:12:43 -0500 Subject: [PATCH 21/26] Fix pydantic Jsontype bug --- llama_cpp/llama_types.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 3683dd844..69d07fc92 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -10,6 +10,12 @@ from typing_extensions import TypedDict, NotRequired, Literal +# NOTE: Defining this correctly using annotations seems to break pydantic validation. +# This is a workaround until we can figure out how to do this correctly +# JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]] +JsonType = Union[None, int, str, bool, List[Any], Dict[str, Any]] + + class EmbeddingUsage(TypedDict): prompt_tokens: int total_tokens: int @@ -72,7 +78,7 @@ class ChatCompletionResponseMessage(TypedDict): class ChatCompletionFunction(TypedDict): name: str description: NotRequired[str] - parameters: Dict[str, Any] # TODO: make this more specific + parameters: Dict[str, JsonType] # TODO: make this more specific class ChatCompletionResponseChoice(TypedDict): @@ -136,9 +142,6 @@ class CreateChatCompletionStreamResponse(TypedDict): choices: List[ChatCompletionStreamResponseChoice] -JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]] - - class ChatCompletionFunctions(TypedDict): name: str description: NotRequired[str] @@ -238,7 +241,7 @@ class ChatCompletionRequestFunctionCallOption(TypedDict): Literal["none", "auto"], ChatCompletionRequestFunctionCallOption ] -ChatCompletionFunctionParameters = Dict[str, JsonType] +ChatCompletionFunctionParameters = Dict[str, JsonType] # TODO: make this more specific class ChatCompletionToolFunction(TypedDict): From 21165e7d2b87092fc194a453d35f68001c5b8a5c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 22:13:04 -0500 Subject: [PATCH 22/26] Accept list of prompt tokens in create_completion --- llama_cpp/llama.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index fd219e28c..7a2c34f45 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1279,7 +1279,7 @@ def embed(self, input: str) -> List[float]: def _create_completion( self, - prompt: str, + prompt: Union[str, List[int]], suffix: Optional[str] = None, max_tokens: int = 16, temperature: float = 0.8, @@ -1314,7 +1314,7 @@ def _create_completion( self.tokenize(prompt.encode("utf-8"), special=True) if prompt != "" else [self.token_bos()] - ) + ) if isinstance(prompt, str) else prompt text: bytes = b"" returned_tokens: int = 0 stop = ( @@ -1327,7 +1327,7 @@ def _create_completion( if len(prompt_tokens) >= self._n_ctx: raise ValueError( - 
f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self._ctx)}" + f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" ) if max_tokens <= 0: @@ -1737,7 +1737,7 @@ def _create_completion( def create_completion( self, - prompt: str, + prompt: Union[str, List[int]], suffix: Optional[str] = None, max_tokens: int = 128, temperature: float = 0.8, From 74c414c7eba6e3812fb8256262e6d448d2ce501b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 22:13:29 -0500 Subject: [PATCH 23/26] Add llava1.5 chat handler --- llama_cpp/llama_chat_format.py | 104 ++++++++++++++++++++++++++++----- llama_cpp/server/app.py | 13 +++++ 2 files changed, 102 insertions(+), 15 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index c029312da..60b38d84d 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import ctypes import dataclasses from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, Protocol @@ -725,6 +726,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): assert "usage" in completion assert isinstance(function_call, str) + assert stream is False # TODO: support stream mode return llama_types.CreateChatCompletionResponse( id="chat" + completion["id"], @@ -751,14 +753,40 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): class Llava15ChatHandler: def __init__(self, clip_model_path: str): + import llama_cpp.llava_cpp as llava_cpp + + self._llava_cpp = llava_cpp self.clip_model_path = clip_model_path - def chat_handler( + self.clip_ctx = self._llava_cpp.clip_model_load(self.clip_model_path.encode(), 0) + + def __del__(self): + if self.clip_ctx is not None: + self._llava_cpp.clip_free(self.clip_ctx) + self.clip_ctx = None + + def load_image(self, image_url: str) -> bytes: + if image_url.startswith("data:"): + import base64 + + image_bytes = base64.b64decode(image_url.split(",")[1]) + return image_bytes + else: + import urllib.request + + with urllib.request.urlopen(image_url) as f: + image_bytes = f.read() + return image_bytes + + def __call__( self, + *, llama: llama.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, @@ -776,18 +804,64 @@ def chat_handler( logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, **kwargs, # type: ignore - ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: - # convert messages into a list of strings and images objects - # for each item in list - # if string, process it and append to prompt - # if image, evaluate it and add empty string to prompt (for now) - # generate completion - items = [] - current_prompt = "" - system_prompt = "" + ) -> Union[llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse]]: + assert llama.context_params.logits_all is True # BUG: logits_all=True is required for llava + assert self.clip_ctx is not None + system_prompt = _get_system_message(messages) + system_prompt = system_prompt if 
system_prompt != "" else "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." + system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." + user_role = "\nUSER:" + assistant_role = "\nASSISTANT:" + llama.reset() + llama.eval(llama.tokenize(system_prompt.encode("utf8"), add_bos=True)) for message in messages: - if message["role"] == "system" and message["content"] is not None: - system_prompt = message["content"] - if message["role"] == "user": - items.append(message["content"]) - current_prompt += message["content"] \ No newline at end of file + if message["role"] == "user" and message["content"] is not None: + if isinstance(message["content"], str): + llama.eval(llama.tokenize(f"{user_role} {message['content']}".encode("utf8"), add_bos=False)) + else: + assert isinstance(message["content"], list) + llama.eval(llama.tokenize(f"{user_role} ".encode("utf8"), add_bos=False)) + for content in message["content"]: + if content["type"] == "text": + llama.eval(llama.tokenize(f"{content['text']}".encode("utf8"), add_bos=False)) + if content["type"] == "image_url": + image_bytes = self.load_image(content["image_url"]["url"]) if isinstance(content["image_url"], dict) else self.load_image(content["image_url"]) + import array + data_array = array.array('B', image_bytes) + c_ubyte_ptr = (ctypes.c_ubyte * len(data_array)).from_buffer(data_array) + embed = self._llava_cpp.llava_image_embed_make_with_bytes(ctx_clip=self.clip_ctx, n_threads=llama.context_params.n_threads, image_bytes=c_ubyte_ptr, image_bytes_length=len(image_bytes)) + # image_bytes_p = (ctypes.c_uint8 * len(image_bytes)).from_buffer_copy(image_bytes) + # embed = self._llava_cpp.llava_image_embed_make_with_bytes(ctx_clip=self.clip_ctx, n_threads=1, image_bytes=image_bytes_p, image_bytes_length=len(image_bytes)) + try: + n_past = ctypes.c_int(llama.n_tokens) + n_past_p = ctypes.pointer(n_past) + self._llava_cpp.llava_eval_image_embed(ctx_llama=llama.ctx, embed=embed, n_batch=llama.n_batch, n_past=n_past_p) + assert llama.n_ctx() >= n_past.value + llama.n_tokens = n_past.value + finally: + self._llava_cpp.llava_image_embed_free(embed) + if message["role"] == "assistant" and message["content"] is not None: + llama.eval(llama.tokenize(f"ASSISTANT: {message['content']}".encode("utf8"), add_bos=False)) + llama.eval(llama.tokenize(f"{assistant_role}".encode("utf8"), add_bos=False)) + + prompt = llama._input_ids.tolist() + + return _convert_completion_to_chat(llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + ), stream=stream) \ No newline at end of file diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 261c58445..8ebc427ec 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -138,6 +138,10 @@ class Settings(BaseSettings): default="llama-2", description="Chat format to use.", ) + clip_model_path: Optional[str] = Field( + default=None, + description="Path to a CLIP model to use for multi-modal chat completion.", + 
) # Cache Params cache: bool = Field( default=False, @@ -375,6 +379,14 @@ def create_app(settings: Optional[Settings] = None): ) app.include_router(router) global llama + + ## + chat_handler = None + if settings.chat_format == "llava-1-5": + assert settings.clip_model_path is not None + chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(clip_model_path=settings.clip_model_path) + ## + llama = llama_cpp.Llama( model_path=settings.model, # Model Params @@ -411,6 +423,7 @@ def create_app(settings: Optional[Settings] = None): numa=settings.numa, # Chat Format Params chat_format=settings.chat_format, + chat_handler=chat_handler, # Misc verbose=settings.verbose, ) From 34aa8588f73ddcecbaccb4eee7c48a81e1e992b0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 22:14:48 -0500 Subject: [PATCH 24/26] Add Multimodal notebook --- examples/notebooks/Multimodal.ipynb | 84 +++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 examples/notebooks/Multimodal.ipynb diff --git a/examples/notebooks/Multimodal.ipynb b/examples/notebooks/Multimodal.ipynb new file mode 100644 index 000000000..11b14df38 --- /dev/null +++ b/examples/notebooks/Multimodal.ipynb @@ -0,0 +1,84 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ChatCompletion(id='chatcmpl-65a710ba-41d1-4d0a-a124-a44b2b4a0189', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content=' The image reads \"LlamaC++.\"', role='assistant', function_call=None, tool_calls=None))], created=1699413274, model='gpt-4-vision-preview', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=10, prompt_tokens=624, total_tokens=634))\n" + ] + } + ], + "source": [ + "from openai import OpenAI\n", + "\n", + "import urllib.request\n", + "import base64\n", + "\n", + "def get_data_url(url):\n", + " return \"data:image/png;base64,\" + base64.b64encode(urllib.request.urlopen(url).read()).decode(\"utf-8\")\n", + "\n", + "client = OpenAI(base_url=\"http://100.64.159.73:8000/v1\", api_key=\"sk-1234\")\n", + "response = client.chat.completions.create(\n", + " model=\"gpt-4-vision-preview\",\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image_url\",\n", + " \"image_url\": {\n", + " \"url\": get_data_url(\"https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png\"),\n", + " # \"url\": \"https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png\",\n", + " },\n", + " },\n", + " {\"type\": \"text\", \"text\": \"What does the image say\"},\n", + " ],\n", + " }\n", + " ],\n", + ")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5+" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + 
"nbformat_minor": 2 +} From 66dda361bbe86e19017fda72df541042afe2ee90 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 7 Nov 2023 22:21:38 -0500 Subject: [PATCH 25/26] Clean up examples --- examples/multimodal/llava.py | 95 ------------------------- examples/multimodal/overfitting_lc.png | Bin 5982 -> 0 bytes 2 files changed, 95 deletions(-) delete mode 100644 examples/multimodal/llava.py delete mode 100644 examples/multimodal/overfitting_lc.png diff --git a/examples/multimodal/llava.py b/examples/multimodal/llava.py deleted file mode 100644 index ac5c33a73..000000000 --- a/examples/multimodal/llava.py +++ /dev/null @@ -1,95 +0,0 @@ -import ctypes -import argparse -import os -import array -import sys - -from llama_cpp import Llama -from llama_cpp.llava_cpp import ( - clip_model_load, - llava_image_embed_make_with_filename, - llava_image_embed_make_with_bytes, - llava_image_embed_free, - llava_validate_embed_size, - llava_eval_image_embed, -) - -parser = argparse.ArgumentParser() -parser.add_argument( - "-m", "--model", type=str, default="../models/llava-v1.5-7b/ggml-model-q5_k.gguf" -) -parser.add_argument("--mmproj", type=str, default="llava-v1.5-7b/mmproj-model-f16.gguf") -parser.add_argument("-t", "--temp", type=float, default=0.1) -parser.add_argument( - "-p", "--prompt", type=str, default="Describe this image in detail." -) -args = parser.parse_args() - -print(f"loading clip model from {args.mmproj}") -if not os.path.exists(args.mmproj): - raise FileNotFoundError(args.mmproj) -ctx_clip = clip_model_load(fname=args.mmproj.encode("utf-8"), verbosity=0) - -image_path = os.path.join(os.path.dirname(__file__), "overfitting_lc.png") -if not os.path.exists(image_path): - raise FileNotFoundError(image_path) -image_embed = llava_image_embed_make_with_filename( - ctx_clip=ctx_clip, n_threads=1, image_path=image_path.encode("utf8") -) - - -def load_image_embed_from_file_bytes(image_path: str): - with open(image_path, "rb") as file: - image_bytes = file.read() - bytes_length = len(image_bytes) - data_array = array.array("B", image_bytes) - c_ubyte_ptr = (ctypes.c_ubyte * len(data_array)).from_buffer(data_array) - return llava_image_embed_make_with_bytes( - ctx_clip=ctx_clip, - n_threads=1, - image_bytes=c_ubyte_ptr, - image_bytes_length=bytes_length, - ) - - -print(f"loading llm model from {args.model}") -if not os.path.exists(args.model): - raise FileNotFoundError(args.model) -llm = Llama( - model_path=args.model, n_ctx=2048, n_gpu_layers=1 -) # longer context needed for image embeds - -if not llava_validate_embed_size(llm.ctx, ctx_clip): - raise RuntimeError("llm and mmproj model embed size mismatch") - -# eval system prompt -system_prompt = "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n" -llm.eval(llm.tokenize(system_prompt.encode("utf8"))) -llm.eval(llm.tokenize("\nUSER: ".encode("utf8"))) - -# eval image embed -n_past = ctypes.c_int(llm.n_tokens) -n_past_p = ctypes.pointer(n_past) -llava_eval_image_embed(llm.ctx, image_embed, llm.n_batch, n_past_p) -llm.n_tokens = n_past.value -llava_image_embed_free(image_embed) - -# eval prompt -prompt = "Describe the visual content of this image" -llm.eval(llm.tokenize(prompt.encode("utf8"))) -llm.eval(llm.tokenize("\nASSISTANT:".encode("utf8"))) - -# get output -print("\n") -max_target_len = 256 -for i in range(max_target_len): - t_id = llm.sample(temp=0.1) - t = llm.detokenize([t_id]).decode("utf8") - if t == "": - break - print(t, end="") - sys.stdout.flush() - llm.eval([t_id]) - -print("\n") -print("done") diff --git a/examples/multimodal/overfitting_lc.png b/examples/multimodal/overfitting_lc.png deleted file mode 100644 index 591b34c68e1ca19bab4d790de6c98e70e773fdf9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5982 zcmV-k7oq5hP)p{QaDqod5j#`p>m=inHC_-Hwiq{N&cw*4FTkP}$kpNMxHz zXPs?rZSU~*{NK*^`1@FKr03`7MPizXiHY^Gg}Au5rnAk3m%H(mS!#i)Z^*w z^Wo+0Z-uZ!V3}iMV{mYA{`d0!@aUnh%khy@TXd!7>G9*|@M>ym|JvOD_4fb${zODX z@|s{%ZJ~B{c3@y&;o;%`-QWNG`@X)u^{;|GT$cF2o%5Y${_^Yo_3!cX_uk^|@sn29 z-01w`(*NM%&CSjF)4Jp1B&Zl#!8&3;zNJvP^%F0eoPWi*4^{syW>fVHeg!#y+^Pz0_xsrH{wf3`#``gEfn!We7j*Xqa zn3$OKqj1N^$JN^Bke|U#Xr8gX)avT$$Isw%b94Xt`v2nPs;a6uS(N_s?fcrr^{9CA zpK1EdvaGDEq@<*Ple%Ggsgk0?``yah-|CsD#>CCu!OGlwdwX47UH{kG|M2qv=IFMz zw$jql_q&z!rgXl?+5i9mR#sNg*5&`}?fvcK_r00@?&VWcQ~TDwt+~-=eXG2@y#IfN z|MT?!*4WU{(D3l^|M&QPkhuBBr_Ti>izBG;NsaB2|BRFWh>!lr%=N2#`O2&&CMKDgng6S>|GK{ag^d5Qx2>(M{QdnvKtTV_(G&{{ zsWV0LcS8QPr~X<|`A#*CYi$e$2HJp;(y6=IS8Hrmg^!Q2o{RSO000yjNklvzo^!9Z=k_+e=iGVx5Jc2~3E3eeS^rO)?C04l`LWi!-?i4e-WMPu zA|fIpA|j$m%mN01Ag=~9*#v^zux1ii3c`{M6SJfM01Pt%W-M4UoNu4Y7yv?Y4RULk z5IsF-u*{I4kqzr^Pad;2UP7(BOD%FALftaXV6z)RBO6w8GX-{LB^>MTnGJH#nVZKL zBm;2_SQ#Gy)XGM*p18QX*Am9<7vt-P2pZY2p4tP;pX345P)N@J0Lw)_&IFBYSiKpy z%tFpF2Adt20T~6%+QnwZ_BjO2n_?RnV$fn)eMSRd@3Tc;KLF(!b@h$|1kIb`oa6xP z#t=6NV0Lox02`nx3+_4)GDJi~L_|bHL_|bHM7pVMiwFaG#a`hvgHVuDH8+0TMkvUq zfZJ!!CXlWrm#FqXpFMkntog|YYry`-6Z<>@LOxp5I|kI`H@oK(&BXPNff^cp|8EE^ zx%Hdh5?FF;&fgPQa_hYhv;@|Ck69{j^sn}439KgGC6zb&;qSBr*6OVAeZ;`O4Hm-* z03dkaIoDjpjehyp8V-wJ{ZQ(j8m+NwGEr@#pZrF{VJ$f19VNXo>EIn9lbdm2>7`#z zy)NxrRmG+ARNCmL|D^E-ej)dAn$&FV=gpl@4E(sTjFtu-VgdkWU9(-2PhgGfIyhCw zb@2fJ`Kws_RoLja|DXkamUu^8wbQV-WqN@a^zoh2NY}t9y7ANupq^ zXV&I0@k0(6yW58v<3;Gr04$tgxVt&vPG&PCb?&zoU38NS#@F6M&2Hc$$Gkkp8jAho zM2M)Tol*}hP}NqOnE+tE&bESnvvuYDF>Z)*^UU;XV7!evcrw1>1+2Wx*`S&&Kn>ht z=NWt<9fmc#Az2v$+{s|DpEN>JXTeYtEbw<&^&0^|(t&Uf+8dqo^B6aV67D+eo(*h^ zk55=}_XVuRi)^WZU*r>?^Q3nK*1abIz`$)Y_RK-RUZ1HhSP+K=E_DKcBwjJ?zUi0- zo^p0?IAMr}%#+mM?h#l6=cKTPe!as)!b@0#GnWGex0z~(h3E)w9{`|&6-@U|;s;~K zI+SNP+XJ0GfM`X*3s?@fPr^8-8E%<_a9H^fkWd5E>;}~n0?AJ~9QDJZuBRSO2LO}v z!sj&mrejCZSmwryfJG~Mdv_azAjAgT z01yS$iDu6RhU}NI-L?VLBKLYXfO&@rh{`4EghlPL3S$Dm1yj>1S{r@;Z?$A1J=(}W zvQB0I5Nz-~P3uaaR)!q$H^=Ro4uErzwT{kR>i7Ssg|HAA!TlNlOx6iqXf#`2eX5PH zP;z)@3;^sq%_^g>(flh~35$x#j!g#uo2)GQjQj^$8dK`jy*F$D0Ko>~T-p{Ov^J)c zsxI6i1|T?Sw~4Opo{xXlQdr=3S@$yln6I;Qp=kkP%vFa;&^!-S!a}60cefFMd0SKH z3PqZxCQ^d=O<4yLRS65em!+Xw`%J^>c_peTrj%n-#A2O(kgrl$de>7A#{j?)&$YC8 z^6D}0NloFxxf_&muH}k@m$Z4eL<7h_$XZVatuMcs`~hhG+*H`%r`)%f^U%OUt3!MM zM%V04(GYL?$sZ=0)%=QLyVFWXQOOBQe9Su834lqk@Sx_s@{P(mgjp5JyNU81M_I7Z ztDgZNse`vc>37+8|2Ubb@~hVKO4lUO;P%LUc4vXB_ZbR6)qzq!W!s%p1D~`e#d8xq 
zk$LhTbSgQ_EdhWs&#WZPfko{)hf63xu-|j7lF>Dv{eF^A%_>_Z zEUut!DNd0ibZuU)(E##YSabJ510=Z6#7ilnF>Fc^SX8xj7zJQflkGaCHub;#=ShK8 zQG9=H8J)473LVa5hFTvJ128GFEmFn|Ooq)e@$0Z%yMf-&1I13V@^{4bPX{o6lW>j6 zl!ldbikEVY?!ZL_kNx__dbQ~RNbF!Kv0n4(pH$tgDzb#Z9<+xoDtbmbJ=Xg=1z=u> z?Sp(wO_}(ewq5I|f(I)Ki`wpg`6vKXZS{OGQCI%T$0{9xn|P(qRT4U^@acOh$}2V+ z0Fq2oCTc%FR^dt3`3J*U>#I}~IIMW80;8LOSA7A%rkHnRVtq?>2Hxbz0lTe!Mk;#x z1oAqFL#>wZ0SL-W!zVUP{NV3Y^lPd%@k*bo&S9QN5{&6ftdge#5ZIKaOo-OBuTn*c49WDGH{D7ITw#sQ`nETqM9+b95>vXra| z&{9Dz8iCDRp{YwEF!f*|@^JRo005hCPsK+f?FK%->Xgv4DN%qa2@Bi({nIf#5*2SyWt&VK9JFvkVn6j{dHY+JDR2AD@ zQfwh)&u27xRvxEqvo6g7rZg;s#z{j?1vXoGE_>883|Ft?r4)TWNs~?`VDXQ|JrV73P43z(Sg<_i<^~43i?()(i5!6!YIx-d`^} z%`$aK6hL!eAuWR2CY|H&aq-NO*WprnB5hK(N@#j*(zlqD%_h0f-@in9#o3>dB4@fl zWexo5O`fdtHUelGEGjDXaS1g%J=@AElIy>%)Z(9s%{rFn`XqI%X)B$llxo{Ke4rYb z7c6`x&*8)?b0*vz$L17|x5s55Yg1F22*vKZq{57_wZ#m5Q8^vQvIo}6++O97%2Q(cP_+5pe;*r0~w%#AVbA6&{ zV95Lp&umLKRYA?7DQmayMyvkmBY!0o3A0@2o8Bh6JQJ3lvQhsAI9(~uePaV}zTB?c zv!R%^YV2R9NHE*1q=Q+klxv?)g+$-9<%3Ui4eTD3v%|b8h1K-EKv@Hy=y#qc+^Xu0 zh)IWqlFQj`qtd}lD+{(Qe!fR3Sc#W9c*5WZ^8syxg_u-s-{|ByCi@St4#@na@`QE^ z5?7xuWrY`2nE+Y{OKf=ho zkzVTiHKoF}F0Xfvw;e|@*+19RHoT~8YN4XJ;0M*mO^tbdG(9h7r4-qi3*>Z4%`Y4Y z_GE6s8$GH(DZ+Mgy&se;<2S>dc$29OJHZH-Y_f3Ock_4E7qPdg|quY9T zH}Mr*W$Sph;d3v&7>=?JXL8D#)~499=2lE4o32V&K)2WBNLecee!=0w`)iy0R_}Ym zSDlvlxySzVYi(^k51dz7Alj6KHGNm$)ihZ*rd@b&YAx%2_~}j8Tz>!kJ`*D&j>-Ha znZi1j7@9K9|^2iJcwlLvHKRO zocX%ldStzMELz%_AW_+oSom^->@}P99f9?ljFPRc%I;hA2HW&2yCG_tIb z7+NUP-y#NHrmC%6WzZs}k!6p>YO6zUu0d!`X=J)-Ewy%igVwY?1eQ!uOT910=)E?P z{vClOTg;nYu}AfYftPbu-R7OT@Rj{UjW3O?LK(IB`$svA`rwl<39PaER#dis8wHKm zq`(4o^l)JJ%lIv=DcPqQ=^c&r+9f6iUQVSd70^2&GP-mc+A2rVgKt5RCW3d5p_ zftPpcbId9>QV+mqq-VWdjECIEeG+1`nOBN6V zFQ=lztpAS~czMN7{@?#uND)|a&J_RqFWGHQ1eSdI@V`>IyT+{DC~$k}giw~t@|ss9IOrfOsRzF>qVQ4Vo46!y znT8VB=UU`Zq8~tozaAOIBz?}MxoGra zc5tmW_lXHCxy0z}3gm|MNeC>t@@FwiHfMXewk7qe4fEe?mFl{=x-E^N?XZ+S$!f%p zE;+N~vbVSQPJh>9OZ%cD7l@}DX(ueY4h`_PKU{J(!ry!EUT(ypuF!|wUvDo-^W`(N z3KpGd?50N^jq0xMYB{`f@7_YM?9{^%u`O3Oc0CI1-*vP5_}6{g7na1Nr$tBl^7&#W z!$|XB(Ow$aXu6ocAhIOtX6TZzmWb5s>})QVyK|?vx3{;KmzS59S72aZU|?WeT-;`j zgQe<>A&dplF-NyYwH-fmbLp=B>d@rmN5_sW>FV0CWAj6egQd<~G=mNr21}i%!e;-N z&^SZ(tycZF=*Bzxldu2yi@w_A^X}OjPa9=!hn#%(EjoMl-O1N~{`pFn9FH|h*E zXBg618d>3Cc;m-}N=>p^>dYpY7gTOW47|KWvhKhl z`QSCgz-t_=k4YLk`K3f@WZi?MsZvi&>7QtM%)q&mFl@Ld`=+Ga4PdUkLk#@z%%!dp zm5#Gz-()Up2RQBnnh)z6dY)rQNksAGmVm@CV1@)>Fqv#n#zV`E0rt~QLkTQ7fP~#@ zNC6BcW9SwWpp0*^85#sj;nK*mHMoLZ+4JMdAV_L^at(SJa1L@`hDAQP=i*z@{Lah- z5hLL)%ziP*H3*?llTd%|Nf0pL=xAUuq;m#-a3&jhjwdB>yDj4Mo(sU1+nqxU`(HT5 zIkm7n(axcmjs+MnSOn2+H?+^}WpB$ggnJO9^(QyYfu%@kZ;+uuHtV` zkU9<|xvd!RaSwVb(jeF2541dn<0b(H%Lf=R6K!)d25z+i7%XpWWmf`bZrg5Am27}B zyFnxY1_LNiZcK0-8cz;U8f3G+(af$)*0ZMkM4A-*AWJkV5n23GKxA2&7 Date: Tue, 7 Nov 2023 22:42:13 -0500 Subject: [PATCH 26/26] Add server docs --- docs/server.md | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 docs/server.md diff --git a/docs/server.md b/docs/server.md new file mode 100644 index 000000000..e7d4bb6d6 --- /dev/null +++ b/docs/server.md @@ -0,0 +1,77 @@ +# OpenAI Compatible Server + +`llama-cpp-python` offers an OpenAI API compatible web server. + +This web server can be used to serve local models and easily connect them to existing clients. 
+ +## Setup + +### Installation + +The server can be installed by running the following command: + +```bash +pip install llama-cpp-python[server] +``` + +### Running the server + +The server can then be started by running the following command: + +```bash +python3 -m llama_cpp.server --model +``` + +### Server options + +For a full list of options, run: + +```bash +python3 -m llama_cpp.server --help +``` + +NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable. + +## Guides + +### Multi-modal Models + +`llama-cpp-python` supports the llava1.5 family of multi-modal models which allow the language model to +read information from both text and images. + +You'll first need to download one of the available multi-modal models in GGUF format: + +- [llava1.5 7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) +- [llava1.5 13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) + +Then when you run the server you'll need to also specify the path to the clip model used for image embedding + +```bash +python3 -m llama_cpp.server --model --clip-model-path +``` + +Then you can just use the OpenAI API as normal + +```python3 +from openai import OpenAI + +client = OpenAI(base_url="http://:/v1", api_key="sk-xxx") +response = client.chat.completions.create( + model="gpt-4-vision-preview", + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "" + }, + }, + {"type": "text", "text": "What does the image say"}, + ], + } + ], +) +print(response) +``` \ No newline at end of file
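
The guide above only exercises the new llava support through the OpenAI-compatible server. As a complementary illustration, the sketch below wires the same pieces together directly in Python: the `Llava15ChatHandler` introduced in PATCH 23/26 and the `chat_handler` parameter added to `Llama` in PATCH 17/26. This is a minimal sketch, not part of the patch series itself; the model and mmproj paths are placeholders, and `logits_all=True` mirrors the assertion inside `Llava15ChatHandler.__call__` rather than an independently documented requirement.

```python
# Minimal sketch (not part of the patches): multi-modal chat completion without
# going through the HTTP server. Model and projector paths are placeholders.
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Load the CLIP/projector model used for image embeddings.
chat_handler = Llava15ChatHandler(clip_model_path="llava-v1.5-7b/mmproj-model-f16.gguf")

llm = Llama(
    model_path="llava-v1.5-7b/ggml-model-q5_k.gguf",
    chat_handler=chat_handler,  # takes precedence over chat_format in create_chat_completion
    n_ctx=2048,                 # larger context to leave room for the image embedding
    logits_all=True,            # required by the llava chat handler (see the assert in __call__)
)

response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png"
                    },
                },
                {"type": "text", "text": "What does the image say?"},
            ],
        }
    ],
)
print(response["choices"][0]["message"]["content"])
```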
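
The change in PATCH 22/26, which lets `create_completion` accept a list of prompt tokens, is what allows the llava handler to hand over a prompt whose image embeddings have already been evaluated (it passes `llama._input_ids.tolist()`). The same signature can be used directly, as in the brief sketch below; the prompt text is only an illustrative placeholder and the `llm` instance is assumed to be the one constructed in the previous sketch.

```python
# Sketch: create_completion also accepts a pre-tokenized prompt (List[int]).
# Reuses the `llm` instance from the previous sketch; the prompt text is arbitrary.
tokens = llm.tokenize(b"USER: Describe overfitting in one sentence.\nASSISTANT:")
completion = llm.create_completion(prompt=tokens, max_tokens=64, temperature=0.1)
print(completion["choices"][0]["text"])
```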