Update llama.cpp

abetlen · abetlen · commit 53861c9e530c · 2023-10-24T03:13:32.000-04:00
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
@@ -595,20 +595,14 @@ def _sample(
         candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p)
         candidates.sorted = llama_cpp.c_bool(False)
         candidates.size = llama_cpp.c_size_t(n_vocab)
-        llama_cpp.llama_sample_repetition_penalty(
-            ctx=self.ctx,
-            last_tokens_data=last_n_tokens_data,
-            last_tokens_size=last_n_tokens_size,
-            candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
-            penalty=repeat_penalty,
-        )
-        llama_cpp.llama_sample_frequency_and_presence_penalties(
+        llama_cpp.llama_sample_repetition_penalties(
             ctx=self.ctx,
             candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
             last_tokens_data=last_n_tokens_data,
-            last_tokens_size=last_n_tokens_size,
-            alpha_frequency=frequency_penalty,
-            alpha_presence=presence_penalty,
+            penalty_last_n=last_n_tokens_size,
+            penalty_repeat=repeat_penalty,
+            penalty_freq=frequency_penalty,
+            penalty_present=presence_penalty,
         )
         if not penalize_nl:
             candidates.data[self._token_nl].logit = llama_cpp.c_float(nl_logit)
@@ -1793,18 +1787,18 @@ def tokenizer(self) -> "LlamaTokenizer":
 
     def token_eos(self) -> int:
         """Return the end-of-sequence token."""
-        assert self.ctx is not None
-        return llama_cpp.llama_token_eos(self.ctx)
+        assert self.model is not None
+        return llama_cpp.llama_token_eos(self.model)
 
     def token_bos(self) -> int:
         """Return the beginning-of-sequence token."""
-        assert self.ctx is not None
-        return llama_cpp.llama_token_bos(self.ctx)
+        assert self.model is not None
+        return llama_cpp.llama_token_bos(self.model)
 
     def token_nl(self) -> int:
         """Return the newline token."""
-        assert self.ctx is not None
-        return llama_cpp.llama_token_nl(self.ctx)
+        assert self.model is not None
+        return llama_cpp.llama_token_nl(self.model)
 
     @staticmethod
     def logits_to_logprobs(logits: List[float]) -> List[float]:
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
@@ -1180,97 +1180,97 @@ def llama_get_embeddings(
 # //
 
 
-# LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
-def llama_token_get_text(ctx: llama_context_p, token: llama_token) -> bytes:
-    return _lib.llama_token_get_text(ctx, token)
+# LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
+def llama_token_get_text(model: llama_model_p, token: llama_token) -> bytes:
+    return _lib.llama_token_get_text(model, token)
 
 
-_lib.llama_token_get_text.argtypes = [llama_context_p, llama_token]
+_lib.llama_token_get_text.argtypes = [llama_model_p, llama_token]
 _lib.llama_token_get_text.restype = c_char_p
 
 
-# LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
-def llama_token_get_score(ctx: llama_context_p, token: llama_token) -> float:
-    return _lib.llama_token_get_score(ctx, token)
+# LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
+def llama_token_get_score(model: llama_model_p, token: llama_token) -> float:
+    return _lib.llama_token_get_score(model, token)
 
 
-_lib.llama_token_get_score.argtypes = [llama_context_p, llama_token]
+_lib.llama_token_get_score.argtypes = [llama_model_p, llama_token]
 _lib.llama_token_get_score.restype = c_float
 
 
-# LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
-def llama_token_get_type(ctx: llama_context_p, token: llama_token) -> int:
-    return _lib.llama_token_get_type(ctx, token)
+# LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
+def llama_token_get_type(model: llama_model_p, token: llama_token) -> int:
+    return _lib.llama_token_get_type(model, token)
 
 
-_lib.llama_token_get_type.argtypes = [llama_context_p, llama_token]
+_lib.llama_token_get_type.argtypes = [llama_model_p, llama_token]
 _lib.llama_token_get_type.restype = ctypes.c_int
 
 
 # // Special tokens
 
 
-# LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence
-def llama_token_bos(ctx: llama_context_p) -> int:
-    return _lib.llama_token_bos(ctx)
+# LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
+def llama_token_bos(model: llama_model_p) -> int:
+    return _lib.llama_token_bos(model)
 
 
-_lib.llama_token_bos.argtypes = [llama_context_p]
+_lib.llama_token_bos.argtypes = [llama_model_p]
 _lib.llama_token_bos.restype = llama_token
 
 
-# LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx);  // end-of-sentence
-def llama_token_eos(ctx: llama_context_p) -> int:
-    return _lib.llama_token_eos(ctx)
+# LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+def llama_token_eos(model: llama_model_p) -> int:
+    return _lib.llama_token_eos(model)
 
 
-_lib.llama_token_eos.argtypes = [llama_context_p]
+_lib.llama_token_eos.argtypes = [llama_model_p]
 _lib.llama_token_eos.restype = llama_token
 
 
-# LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx);  // next-line
-def llama_token_nl(ctx: llama_context_p) -> int:
-    return _lib.llama_token_nl(ctx)
+# LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+def llama_token_nl(model: llama_model_p) -> int:
+    return _lib.llama_token_nl(model)
 
 
-_lib.llama_token_nl.argtypes = [llama_context_p]
+_lib.llama_token_nl.argtypes = [llama_model_p]
 _lib.llama_token_nl.restype = llama_token
 
 
 # // codellama infill tokens
-# LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
-def llama_token_prefix(ctx: llama_context_p) -> int:
-    return _lib.llama_token_prefix(ctx)
+# LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
+def llama_token_prefix(model: llama_model_p) -> int:
+    return _lib.llama_token_prefix(model)
 
 
-_lib.llama_token_prefix.argtypes = [llama_context_p]
+_lib.llama_token_prefix.argtypes = [llama_model_p]
 _lib.llama_token_prefix.restype = llama_token
 
 
-# LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
-def llama_token_middle(ctx: llama_context_p) -> int:
-    return _lib.llama_token_middle(ctx)
+# LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
+def llama_token_middle(model: llama_model_p) -> int:
+    return _lib.llama_token_middle(model)
 
 
-_lib.llama_token_middle.argtypes = [llama_context_p]
+_lib.llama_token_middle.argtypes = [llama_model_p]
 _lib.llama_token_middle.restype = llama_token
 
 
-# LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
-def llama_token_suffix(ctx: llama_context_p) -> int:
-    return _lib.llama_token_suffix(ctx)
+# LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
+def llama_token_suffix(model: llama_model_p) -> int:
+    return _lib.llama_token_suffix(model)
 
 
-_lib.llama_token_suffix.argtypes = [llama_context_p]
+_lib.llama_token_suffix.argtypes = [llama_model_p]
 _lib.llama_token_suffix.restype = llama_token
 
 
-# LLAMA_API llama_token llama_token_eot   (const struct llama_context * ctx); // End of infill middle
-def llama_token_eot(ctx: llama_context_p) -> int:
-    return _lib.llama_token_eot(ctx)
+# LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
+def llama_token_eot(model: llama_model_p) -> int:
+    return _lib.llama_token_eot(model)
 
 
-_lib.llama_token_eot.argtypes = [llama_context_p]
+_lib.llama_token_eot.argtypes = [llama_model_p]
 _lib.llama_token_eot.restype = llama_token
 
 
@@ -1431,70 +1431,46 @@ def llama_set_rng_seed(ctx: llama_context_p, seed: Union[c_uint32, int]):
 _lib.llama_set_rng_seed.restype = None
 
 
-# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-# LLAMA_API void llama_sample_repetition_penalty(
+# /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+# /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+# LLAMA_API void llama_sample_repetition_penalties(
 #         struct llama_context * ctx,
 #       llama_token_data_array * candidates,
 #            const llama_token * last_tokens,
-#                       size_t   last_tokens_size,
-#                       float    penalty);
-def llama_sample_repetition_penalty(
+#                       size_t   penalty_last_n,
+#                        float   penalty_repeat,
+#                        float   penalty_freq,
+#                        float   penalty_present);
+def llama_sample_repetition_penalties(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
     last_tokens_data,  # type: Array[llama_token]
-    last_tokens_size: Union[c_int, int],
-    penalty: Union[c_float, float],
+    penalty_last_n: Union[c_size_t, int],
+    penalty_repeat: Union[c_float, float],
+    penalty_freq: Union[c_float, float],
+    penalty_present: Union[c_float, float],
 ):
-    return _lib.llama_sample_repetition_penalty(
-        ctx, candidates, last_tokens_data, last_tokens_size, penalty
-    )
-
-
-_lib.llama_sample_repetition_penalty.argtypes = [
-    llama_context_p,
-    llama_token_data_array_p,
-    llama_token_p,
-    c_int,
-    c_float,
-]
-_lib.llama_sample_repetition_penalty.restype = None
-
-
-# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-# LLAMA_API void llama_sample_frequency_and_presence_penalties(
-#         struct llama_context * ctx,
-#       llama_token_data_array * candidates,
-#            const llama_token * last_tokens,
-#                       size_t   last_tokens_size,
-#                        float   alpha_frequency,
-#                        float   alpha_presence);
-def llama_sample_frequency_and_presence_penalties(
-    ctx: llama_context_p,
-    candidates,  # type: _Pointer[llama_token_data_array]
-    last_tokens_data,  # type: Array[llama_token]
-    last_tokens_size: Union[c_int, int],
-    alpha_frequency: Union[c_float, float],
-    alpha_presence: Union[c_float, float],
-):
-    return _lib.llama_sample_frequency_and_presence_penalties(
+    return _lib.llama_sample_repetition_penalties(
         ctx,
         candidates,
         last_tokens_data,
-        last_tokens_size,
-        alpha_frequency,
-        alpha_presence,
+        penalty_last_n,
+        penalty_repeat,
+        penalty_freq,
+        penalty_present,
     )
 
 
-_lib.llama_sample_frequency_and_presence_penalties.argtypes = [
+_lib.llama_sample_repetition_penalties.argtypes = [
     llama_context_p,
     llama_token_data_array_p,
     llama_token_p,
-    c_int,
+    c_size_t,
+    c_float,
     c_float,
     c_float,
 ]
-_lib.llama_sample_frequency_and_presence_penalties.restype = None
+_lib.llama_sample_repetition_penalties.restype = None
 
 
 # /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 8cf19d60dc93809db8e51fedc811595eed9134c5
+Subproject commit e3932593d46c30145301a13097895f9376cba509

Original file line number	Original file line	Diff line number	Diff line change
`@@ -595,20 +595,14 @@ def _sample(`
`595`	`candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p)`	`595`	`candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p)`
`596`	`candidates.sorted = llama_cpp.c_bool(False)`	`596`	`candidates.sorted = llama_cpp.c_bool(False)`
`597`	`candidates.size = llama_cpp.c_size_t(n_vocab)`	`597`	`candidates.size = llama_cpp.c_size_t(n_vocab)`
`598`	`- llama_cpp.llama_sample_repetition_penalty(`	`598`	`+ llama_cpp.llama_sample_repetition_penalties(`
`599`	`- ctx=self.ctx,`
`600`	`- last_tokens_data=last_n_tokens_data,`
`601`	`- last_tokens_size=last_n_tokens_size,`
`602`	`- candidates=llama_cpp.ctypes.byref(candidates), # type: ignore`
`603`	`- penalty=repeat_penalty,`
`604`	`- )`
`605`	`- llama_cpp.llama_sample_frequency_and_presence_penalties(`
`606`	`ctx=self.ctx,`	`599`	`ctx=self.ctx,`
`607`	`candidates=llama_cpp.ctypes.byref(candidates), # type: ignore`	`600`	`candidates=llama_cpp.ctypes.byref(candidates), # type: ignore`
`608`	`last_tokens_data=last_n_tokens_data,`	`601`	`last_tokens_data=last_n_tokens_data,`
`609`	`- last_tokens_size=last_n_tokens_size,`	`602`	`+ penalty_last_n=last_n_tokens_size,`
`610`	`- alpha_frequency=frequency_penalty,`	`603`	`+ penalty_repeat=repeat_penalty,`
`611`	`- alpha_presence=presence_penalty,`	`604`	`+ penalty_freq=frequency_penalty,`
		`605`	`+ penalty_present=presence_penalty,`
`612`	`)`	`606`	`)`
`613`	`if not penalize_nl:`	`607`	`if not penalize_nl:`
`614`	`candidates.data[self._token_nl].logit = llama_cpp.c_float(nl_logit)`	`608`	`candidates.data[self._token_nl].logit = llama_cpp.c_float(nl_logit)`
`@@ -1793,18 +1787,18 @@ def tokenizer(self) -> "LlamaTokenizer":`
`1793`		`1787`
`1794`	`def token_eos(self) -> int:`	`1788`	`def token_eos(self) -> int:`
`1795`	`"""Return the end-of-sequence token."""`	`1789`	`"""Return the end-of-sequence token."""`
`1796`	`- assert self.ctx is not None`	`1790`	`+ assert self.model is not None`
`1797`	`- return llama_cpp.llama_token_eos(self.ctx)`	`1791`	`+ return llama_cpp.llama_token_eos(self.model)`
`1798`		`1792`
`1799`	`def token_bos(self) -> int:`	`1793`	`def token_bos(self) -> int:`
`1800`	`"""Return the beginning-of-sequence token."""`	`1794`	`"""Return the beginning-of-sequence token."""`
`1801`	`- assert self.ctx is not None`	`1795`	`+ assert self.model is not None`
`1802`	`- return llama_cpp.llama_token_bos(self.ctx)`	`1796`	`+ return llama_cpp.llama_token_bos(self.model)`
`1803`		`1797`
`1804`	`def token_nl(self) -> int:`	`1798`	`def token_nl(self) -> int:`
`1805`	`"""Return the newline token."""`	`1799`	`"""Return the newline token."""`
`1806`	`- assert self.ctx is not None`	`1800`	`+ assert self.model is not None`
`1807`	`- return llama_cpp.llama_token_nl(self.ctx)`	`1801`	`+ return llama_cpp.llama_token_nl(self.model)`
`1808`		`1802`
`1809`	`@staticmethod`	`1803`	`@staticmethod`
`1810`	`def logits_to_logprobs(logits: List[float]) -> List[float]:`	`1804`	`def logits_to_logprobs(logits: List[float]) -> List[float]:`
Original file line number	Original file line	Diff line number	Diff line change
`@@ -1180,97 +1180,97 @@ def llama_get_embeddings(`
`1180`	`# //`	`1180`	`# //`
`1181`		`1181`
`1182`		`1182`
`1183`	`-# LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);`	`1183`	`+# LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);`
`1184`	`-def llama_token_get_text(ctx: llama_context_p, token: llama_token) -> bytes:`	`1184`	`+def llama_token_get_text(model: llama_model_p, token: llama_token) -> bytes:`
`1185`	`- return _lib.llama_token_get_text(ctx, token)`	`1185`	`+ return _lib.llama_token_get_text(model, token)`
`1186`		`1186`
`1187`		`1187`
`1188`	`-_lib.llama_token_get_text.argtypes = [llama_context_p, llama_token]`	`1188`	`+_lib.llama_token_get_text.argtypes = [llama_model_p, llama_token]`
`1189`	`_lib.llama_token_get_text.restype = c_char_p`	`1189`	`_lib.llama_token_get_text.restype = c_char_p`
`1190`		`1190`
`1191`		`1191`
`1192`	`-# LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);`	`1192`	`+# LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);`
`1193`	`-def llama_token_get_score(ctx: llama_context_p, token: llama_token) -> float:`	`1193`	`+def llama_token_get_score(model: llama_model_p, token: llama_token) -> float:`
`1194`	`- return _lib.llama_token_get_score(ctx, token)`	`1194`	`+ return _lib.llama_token_get_score(model, token)`
`1195`		`1195`
`1196`		`1196`
`1197`	`-_lib.llama_token_get_score.argtypes = [llama_context_p, llama_token]`	`1197`	`+_lib.llama_token_get_score.argtypes = [llama_model_p, llama_token]`
`1198`	`_lib.llama_token_get_score.restype = c_float`	`1198`	`_lib.llama_token_get_score.restype = c_float`
`1199`		`1199`
`1200`		`1200`
`1201`	`-# LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);`	`1201`	`+# LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);`
`1202`	`-def llama_token_get_type(ctx: llama_context_p, token: llama_token) -> int:`	`1202`	`+def llama_token_get_type(model: llama_model_p, token: llama_token) -> int:`
`1203`	`- return _lib.llama_token_get_type(ctx, token)`	`1203`	`+ return _lib.llama_token_get_type(model, token)`
`1204`		`1204`
`1205`		`1205`
`1206`	`-_lib.llama_token_get_type.argtypes = [llama_context_p, llama_token]`	`1206`	`+_lib.llama_token_get_type.argtypes = [llama_model_p, llama_token]`
`1207`	`_lib.llama_token_get_type.restype = ctypes.c_int`	`1207`	`_lib.llama_token_get_type.restype = ctypes.c_int`
`1208`		`1208`
`1209`		`1209`
`1210`	`# // Special tokens`	`1210`	`# // Special tokens`
`1211`		`1211`
`1212`		`1212`
`1213`	`-# LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence`	`1213`	`+# LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence`
`1214`	`-def llama_token_bos(ctx: llama_context_p) -> int:`	`1214`	`+def llama_token_bos(model: llama_model_p) -> int:`
`1215`	`- return _lib.llama_token_bos(ctx)`	`1215`	`+ return _lib.llama_token_bos(model)`
`1216`		`1216`
`1217`		`1217`
`1218`	`-_lib.llama_token_bos.argtypes = [llama_context_p]`	`1218`	`+_lib.llama_token_bos.argtypes = [llama_model_p]`
`1219`	`_lib.llama_token_bos.restype = llama_token`	`1219`	`_lib.llama_token_bos.restype = llama_token`
`1220`		`1220`
`1221`		`1221`
`1222`	`-# LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence`	`1222`	`+# LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence`
`1223`	`-def llama_token_eos(ctx: llama_context_p) -> int:`	`1223`	`+def llama_token_eos(model: llama_model_p) -> int:`
`1224`	`- return _lib.llama_token_eos(ctx)`	`1224`	`+ return _lib.llama_token_eos(model)`
`1225`		`1225`
`1226`		`1226`
`1227`	`-_lib.llama_token_eos.argtypes = [llama_context_p]`	`1227`	`+_lib.llama_token_eos.argtypes = [llama_model_p]`
`1228`	`_lib.llama_token_eos.restype = llama_token`	`1228`	`_lib.llama_token_eos.restype = llama_token`
`1229`		`1229`
`1230`		`1230`
`1231`	`-# LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line`	`1231`	`+# LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line`
`1232`	`-def llama_token_nl(ctx: llama_context_p) -> int:`	`1232`	`+def llama_token_nl(model: llama_model_p) -> int:`
`1233`	`- return _lib.llama_token_nl(ctx)`	`1233`	`+ return _lib.llama_token_nl(model)`
`1234`		`1234`
`1235`		`1235`
`1236`	`-_lib.llama_token_nl.argtypes = [llama_context_p]`	`1236`	`+_lib.llama_token_nl.argtypes = [llama_model_p]`
`1237`	`_lib.llama_token_nl.restype = llama_token`	`1237`	`_lib.llama_token_nl.restype = llama_token`
`1238`		`1238`
`1239`		`1239`
`1240`	`# // codellama infill tokens`	`1240`	`# // codellama infill tokens`
`1241`	`-# LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix`	`1241`	`+# LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix`
`1242`	`-def llama_token_prefix(ctx: llama_context_p) -> int:`	`1242`	`+def llama_token_prefix(model: llama_model_p) -> int:`
`1243`	`- return _lib.llama_token_prefix(ctx)`	`1243`	`+ return _lib.llama_token_prefix(model)`
`1244`		`1244`
`1245`		`1245`
`1246`	`-_lib.llama_token_prefix.argtypes = [llama_context_p]`	`1246`	`+_lib.llama_token_prefix.argtypes = [llama_model_p]`
`1247`	`_lib.llama_token_prefix.restype = llama_token`	`1247`	`_lib.llama_token_prefix.restype = llama_token`
`1248`		`1248`
`1249`		`1249`
`1250`	`-# LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle`	`1250`	`+# LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle`
`1251`	`-def llama_token_middle(ctx: llama_context_p) -> int:`	`1251`	`+def llama_token_middle(model: llama_model_p) -> int:`
`1252`	`- return _lib.llama_token_middle(ctx)`	`1252`	`+ return _lib.llama_token_middle(model)`
`1253`		`1253`
`1254`		`1254`
`1255`	`-_lib.llama_token_middle.argtypes = [llama_context_p]`	`1255`	`+_lib.llama_token_middle.argtypes = [llama_model_p]`
`1256`	`_lib.llama_token_middle.restype = llama_token`	`1256`	`_lib.llama_token_middle.restype = llama_token`
`1257`		`1257`
`1258`		`1258`
`1259`	`-# LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix`	`1259`	`+# LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix`
`1260`	`-def llama_token_suffix(ctx: llama_context_p) -> int:`	`1260`	`+def llama_token_suffix(model: llama_model_p) -> int:`
`1261`	`- return _lib.llama_token_suffix(ctx)`	`1261`	`+ return _lib.llama_token_suffix(model)`
`1262`		`1262`
`1263`		`1263`
`1264`	`-_lib.llama_token_suffix.argtypes = [llama_context_p]`	`1264`	`+_lib.llama_token_suffix.argtypes = [llama_model_p]`
`1265`	`_lib.llama_token_suffix.restype = llama_token`	`1265`	`_lib.llama_token_suffix.restype = llama_token`
`1266`		`1266`
`1267`		`1267`
`1268`	`-# LLAMA_API llama_token llama_token_eot (const struct llama_context * ctx); // End of infill middle`	`1268`	`+# LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle`
`1269`	`-def llama_token_eot(ctx: llama_context_p) -> int:`	`1269`	`+def llama_token_eot(model: llama_model_p) -> int:`
`1270`	`- return _lib.llama_token_eot(ctx)`	`1270`	`+ return _lib.llama_token_eot(model)`
`1271`		`1271`
`1272`		`1272`
`1273`	`-_lib.llama_token_eot.argtypes = [llama_context_p]`	`1273`	`+_lib.llama_token_eot.argtypes = [llama_model_p]`
`1274`	`_lib.llama_token_eot.restype = llama_token`	`1274`	`_lib.llama_token_eot.restype = llama_token`
`1275`		`1275`
`1276`		`1276`
`@@ -1431,70 +1431,46 @@ def llama_set_rng_seed(ctx: llama_context_p, seed: Union[c_uint32, int]):`
`1431`	`_lib.llama_set_rng_seed.restype = None`	`1431`	`_lib.llama_set_rng_seed.restype = None`
`1432`		`1432`
`1433`		`1433`
`1434`	`-# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.`	`1434`	`+# /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.`
`1435`	`-# LLAMA_API void llama_sample_repetition_penalty(`	`1435`	`+# /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.`
		`1436`	`+# LLAMA_API void llama_sample_repetition_penalties(`
`1436`	`# struct llama_context * ctx,`	`1437`	`# struct llama_context * ctx,`
`1437`	`# llama_token_data_array * candidates,`	`1438`	`# llama_token_data_array * candidates,`
`1438`	`# const llama_token * last_tokens,`	`1439`	`# const llama_token * last_tokens,`
`1439`	`-# size_t last_tokens_size,`	`1440`	`+# size_t penalty_last_n,`
`1440`	`-# float penalty);`	`1441`	`+# float penalty_repeat,`
`1441`	`-def llama_sample_repetition_penalty(`	`1442`	`+# float penalty_freq,`
		`1443`	`+# float penalty_present);`
		`1444`	`+def llama_sample_repetition_penalties(`
`1442`	`ctx: llama_context_p,`	`1445`	`ctx: llama_context_p,`
`1443`	`candidates, # type: _Pointer[llama_token_data_array]`	`1446`	`candidates, # type: _Pointer[llama_token_data_array]`
`1444`	`last_tokens_data, # type: Array[llama_token]`	`1447`	`last_tokens_data, # type: Array[llama_token]`
`1445`	`- last_tokens_size: Union[c_int, int],`	`1448`	`+ penalty_last_n: Union[c_size_t, int],`
`1446`	`- penalty: Union[c_float, float],`	`1449`	`+ penalty_repeat: Union[c_float, float],`
		`1450`	`+ penalty_freq: Union[c_float, float],`
		`1451`	`+ penalty_present: Union[c_float, float],`
`1447`	`):`	`1452`	`):`
`1448`	`- return _lib.llama_sample_repetition_penalty(`	`1453`	`+ return _lib.llama_sample_repetition_penalties(`
`1449`	`- ctx, candidates, last_tokens_data, last_tokens_size, penalty`
`1450`	`- )`
`1451`	`-`
`1452`	`-`
`1453`	`-_lib.llama_sample_repetition_penalty.argtypes = [`
`1454`	`- llama_context_p,`
`1455`	`- llama_token_data_array_p,`
`1456`	`- llama_token_p,`
`1457`	`- c_int,`
`1458`	`- c_float,`
`1459`	`-]`
`1460`	`-_lib.llama_sample_repetition_penalty.restype = None`
`1461`	`-`
`1462`	`-`
`1463`	`-# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.`
`1464`	`-# LLAMA_API void llama_sample_frequency_and_presence_penalties(`
`1465`	`-# struct llama_context * ctx,`
`1466`	`-# llama_token_data_array * candidates,`
`1467`	`-# const llama_token * last_tokens,`
`1468`	`-# size_t last_tokens_size,`
`1469`	`-# float alpha_frequency,`
`1470`	`-# float alpha_presence);`
`1471`	`-def llama_sample_frequency_and_presence_penalties(`
`1472`	`- ctx: llama_context_p,`
`1473`	`- candidates, # type: _Pointer[llama_token_data_array]`
`1474`	`- last_tokens_data, # type: Array[llama_token]`
`1475`	`- last_tokens_size: Union[c_int, int],`
`1476`	`- alpha_frequency: Union[c_float, float],`
`1477`	`- alpha_presence: Union[c_float, float],`
`1478`	`-):`
`1479`	`- return _lib.llama_sample_frequency_and_presence_penalties(`
`1480`	`ctx,`	`1454`	`ctx,`
`1481`	`candidates,`	`1455`	`candidates,`
`1482`	`last_tokens_data,`	`1456`	`last_tokens_data,`
`1483`	`- last_tokens_size,`	`1457`	`+ penalty_last_n,`
`1484`	`- alpha_frequency,`	`1458`	`+ penalty_repeat,`
`1485`	`- alpha_presence,`	`1459`	`+ penalty_freq,`
		`1460`	`+ penalty_present,`
`1486`	`)`	`1461`	`)`
`1487`		`1462`
`1488`		`1463`
`1489`	`-_lib.llama_sample_frequency_and_presence_penalties.argtypes = [`	`1464`	`+_lib.llama_sample_repetition_penalties.argtypes = [`
`1490`	`llama_context_p,`	`1465`	`llama_context_p,`
`1491`	`llama_token_data_array_p,`	`1466`	`llama_token_data_array_p,`
`1492`	`llama_token_p,`	`1467`	`llama_token_p,`
`1493`	`- c_int,`	`1468`	`+ c_size_t,`
		`1469`	`+ c_float,`
`1494`	`c_float,`	`1470`	`c_float,`
`1495`	`c_float,`	`1471`	`c_float,`
`1496`	`]`	`1472`	`]`
`1497`	`-_lib.llama_sample_frequency_and_presence_penalties.restype = None`	`1473`	`+_lib.llama_sample_repetition_penalties.restype = None`
`1498`		`1474`
`1499`		`1475`
`1500`	`# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806`	`1476`	`# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806`