@@ -506,7 +506,7 @@ def llama_mlock_supported() -> bool:
 _lib.llama_mlock_supported.restype = c_bool
 
 
-# LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+# LLAMA_API int llama_n_vocab    (const struct llama_context * ctx);
 def llama_n_vocab(ctx: llama_context_p) -> int:
     return _lib.llama_n_vocab(ctx)
 
@@ -515,7 +515,7 @@ def llama_n_vocab(ctx: llama_context_p) -> int:
 _lib.llama_n_vocab.restype = c_int
 
 
-# LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+# LLAMA_API int llama_n_ctx      (const struct llama_context * ctx);
 def llama_n_ctx(ctx: llama_context_p) -> int:
     return _lib.llama_n_ctx(ctx)
 
@@ -524,7 +524,16 @@ def llama_n_ctx(ctx: llama_context_p) -> int:
 _lib.llama_n_ctx.restype = c_int
 
 
-# LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+# LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
+def llama_n_ctx_train(ctx: llama_context_p) -> int:
+    return _lib.llama_n_ctx_train(ctx)
+
+
+_lib.llama_n_ctx_train.argtypes = [llama_context_p]
+_lib.llama_n_ctx_train.restype = c_int
+
+
+# LLAMA_API int llama_n_embd     (const struct llama_context * ctx);
 def llama_n_embd(ctx: llama_context_p) -> int:
     return _lib.llama_n_embd(ctx)
 
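Note on the new binding: `llama_n_ctx` reports the context size the current `llama_context` was created with, while `llama_n_ctx_train` reports the context length the model was trained on. A minimal sketch of comparing the two, assuming these bindings plus the loader functions already in this module (the model path and `n_ctx` value below are hypothetical):

```python
import llama_cpp

llama_cpp.llama_backend_init(False)  # numa=False

params = llama_cpp.llama_context_default_params()
params.n_ctx = 4096  # hypothetical requested runtime context size

model = llama_cpp.llama_load_model_from_file(b"./model.gguf", params)  # hypothetical path
ctx = llama_cpp.llama_new_context_with_model(model, params)

n_ctx = llama_cpp.llama_n_ctx(ctx)              # context size of this llama_context
n_ctx_train = llama_cpp.llama_n_ctx_train(ctx)  # context size the model was trained on

if n_ctx > n_ctx_train:
    print(f"warning: n_ctx ({n_ctx}) exceeds the training context ({n_ctx_train})")
```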
@@ -542,7 +551,7 @@ def llama_vocab_type(ctx: llama_context_p) -> int:
 _lib.llama_vocab_type.restype = c_int
 
 
-# LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
+# LLAMA_API int llama_model_n_vocab    (const struct llama_model * model);
 def llama_model_n_vocab(model: llama_model_p) -> int:
     return _lib.llama_model_n_vocab(model)
 
@@ -551,7 +560,7 @@ def llama_model_n_vocab(model: llama_model_p) -> int:
 _lib.llama_model_n_vocab.restype = c_int
 
 
-# LLAMA_API int llama_model_n_ctx  (const struct llama_model * model);
+# LLAMA_API int llama_model_n_ctx      (const struct llama_model * model);
 def llama_model_n_ctx(model: llama_model_p) -> int:
     return _lib.llama_model_n_ctx(model)
 
@@ -560,7 +569,16 @@ def llama_model_n_ctx(model: llama_model_p) -> int:
 _lib.llama_model_n_ctx.restype = c_int
 
 
-# LLAMA_API int llama_model_n_embd (const struct llama_model * model);
+# LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
+def llama_model_n_ctx_train(model: llama_model_p) -> int:
+    return _lib.llama_model_n_ctx_train(model)
+
+
+_lib.llama_model_n_ctx_train.argtypes = [llama_model_p]
+_lib.llama_model_n_ctx_train.restype = c_int
+
+
+# LLAMA_API int llama_model_n_embd     (const struct llama_model * model);
 def llama_model_n_embd(model: llama_model_p) -> int:
     return _lib.llama_model_n_embd(model)
 
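The `llama_model_n_*` getters mirror the context-level ones but take a `llama_model_p`, so model metadata can be read from a loaded model without creating a `llama_context` first. A short sketch, assuming `model` was obtained from `llama_load_model_from_file` as in the earlier example:

```python
# model: llama_model_p returned by llama_load_model_from_file
print("vocab size   :", llama_cpp.llama_model_n_vocab(model))
print("training ctx :", llama_cpp.llama_model_n_ctx_train(model))
print("embedding dim:", llama_cpp.llama_model_n_embd(model))
```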
@@ -1046,74 +1064,14 @@ def llama_grammar_free(grammar: llama_grammar_p):
 _lib.llama_grammar_free.argtypes = [llama_grammar_p]
 _lib.llama_grammar_free.restype = None
 
-# //
-# // Beam search
-# //
-
-
-# struct llama_beam_view {
-#     const llama_token * tokens;
-#     size_t n_tokens;
-#     float p;   // Cumulative beam probability (renormalized relative to all beams)
-#     bool eob;  // Callback should set this to true when a beam is at end-of-beam.
-# };
-class llama_beam_view(ctypes.Structure):
-    _fields_ = [
-        ("tokens", llama_token_p),
-        ("n_tokens", c_size_t),
-        ("p", c_float),
-        ("eob", c_bool),
-    ]
 
+# LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
+def llama_grammar_copy(grammar: llama_grammar_p) -> llama_grammar_p:
+    return _lib.llama_grammar_copy(grammar)
 
-# // Passed to beam_search_callback function.
-# // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
-# // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
-# // These pointers are valid only during the synchronous callback, so should not be saved.
-# struct llama_beams_state {
-#     struct llama_beam_view * beam_views;
-#     size_t n_beams;               // Number of elements in beam_views[].
-#     size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
-#     bool last_call;               // True iff this is the last callback invocation.
-# };
-class llama_beams_state(ctypes.Structure):
-    _fields_ = [
-        ("beam_views", POINTER(llama_beam_view)),
-        ("n_beams", c_size_t),
-        ("common_prefix_length", c_size_t),
-        ("last_call", c_bool),
-    ]
-
-
-# // Type of pointer to the beam_search_callback function.
-# // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
-# // passed back to beam_search_callback. This avoids having to use global variables in the callback.
-# typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
-llama_beam_search_callback_fn_t = ctypes.CFUNCTYPE(None, c_void_p, llama_beams_state)
-
-
-# /// @details Deterministically returns entire sentence constructed by a beam search.
-# /// @param ctx Pointer to the llama_context.
-# /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
-# /// @param callback_data A pointer that is simply passed back to callback.
-# /// @param n_beams Number of beams to use.
-# /// @param n_past Number of tokens already evaluated.
-# /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
-# /// @param n_threads Number of threads as passed to llama_eval().
-# LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
-def llama_beam_search(
-    ctx: llama_context_p,
-    callback: "ctypes._CFuncPtr[None, c_void_p, llama_beams_state]",  # type: ignore
-    callback_data: c_void_p,
-    n_beams: c_size_t,
-    n_past: c_int,
-    n_predict: c_int,
-    n_threads: c_int,
-):
-    return _lib.llama_beam_search(
-        ctx, callback, callback_data, n_beams, n_past, n_predict, n_threads
-    )
-
+
+_lib.llama_grammar_copy.argtypes = [llama_grammar_p]
+_lib.llama_grammar_copy.restype = llama_grammar_p
 
 # //
 # // Sampling functions
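The new `llama_grammar_copy` duplicates a grammar's parse state. One plausible use is snapshotting the grammar before advancing it, so a speculative branch can be discarded. A minimal sketch, assuming `grammar` is a `llama_grammar_p` previously created with `llama_grammar_init`:

```python
# Snapshot the current grammar state before mutating it.
snapshot = llama_cpp.llama_grammar_copy(grammar)

# ... sample with `grammar`, advancing its state via llama_grammar_accept_token ...

# Discard the advanced state and fall back to the snapshot.
llama_cpp.llama_grammar_free(grammar)
grammar = snapshot
```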
@@ -1436,6 +1394,74 @@ def llama_grammar_accept_token(
     llama_token,
 ]
 _lib.llama_grammar_accept_token.restype = None
+# //
+# // Beam search
+# //
+
+
+# struct llama_beam_view {
+#     const llama_token * tokens;
+#     size_t n_tokens;
+#     float p;   // Cumulative beam probability (renormalized relative to all beams)
+#     bool eob;  // Callback should set this to true when a beam is at end-of-beam.
+# };
+class llama_beam_view(ctypes.Structure):
+    _fields_ = [
+        ("tokens", llama_token_p),
+        ("n_tokens", c_size_t),
+        ("p", c_float),
+        ("eob", c_bool),
+    ]
+
+
+# // Passed to beam_search_callback function.
+# // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+# // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+# // These pointers are valid only during the synchronous callback, so should not be saved.
+# struct llama_beams_state {
+#     struct llama_beam_view * beam_views;
+#     size_t n_beams;               // Number of elements in beam_views[].
+#     size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
+#     bool last_call;               // True iff this is the last callback invocation.
+# };
+class llama_beams_state(ctypes.Structure):
+    _fields_ = [
+        ("beam_views", POINTER(llama_beam_view)),
+        ("n_beams", c_size_t),
+        ("common_prefix_length", c_size_t),
+        ("last_call", c_bool),
+    ]
+
+
+# // Type of pointer to the beam_search_callback function.
+# // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
+# // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+# typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
+llama_beam_search_callback_fn_t = ctypes.CFUNCTYPE(None, c_void_p, llama_beams_state)
+
+
+# /// @details Deterministically returns entire sentence constructed by a beam search.
+# /// @param ctx Pointer to the llama_context.
+# /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+# /// @param callback_data A pointer that is simply passed back to callback.
+# /// @param n_beams Number of beams to use.
+# /// @param n_past Number of tokens already evaluated.
+# /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+# /// @param n_threads Number of threads as passed to llama_eval().
+# LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+def llama_beam_search(
+    ctx: llama_context_p,
+    callback: "ctypes._CFuncPtr[None, c_void_p, llama_beams_state]",  # type: ignore
+    callback_data: c_void_p,
+    n_beams: c_size_t,
+    n_past: c_int,
+    n_predict: c_int,
+    n_threads: c_int,
+):
+    return _lib.llama_beam_search(
+        ctx, callback, callback_data, n_beams, n_past, n_predict, n_threads
+    )
+
 
 
 # Performance information
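The beam search block above is moved unchanged from its old position after `llama_grammar_free`; only `llama_grammar_copy` replaces it there. For reference, a sketch of how the relocated callback type might be used: it assumes an initialized `ctx` whose prompt has already been evaluated up to `n_past` tokens, and `eos_id` as the model's EOS token id (both hypothetical), with the beam counts and limits chosen arbitrarily.

```python
import llama_cpp

@llama_cpp.llama_beam_search_callback_fn_t
def beam_callback(callback_data, beams_state):
    # beam_views pointers are only valid for the duration of this call.
    for i in range(beams_state.n_beams):
        view = beams_state.beam_views[i]
        if view.n_tokens > 0 and view.tokens[view.n_tokens - 1] == eos_id:
            view.eob = True  # mark this beam as finished

llama_cpp.llama_beam_search(ctx, beam_callback, None, 4, n_past, 32, 4)
```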
@@ -1494,6 +1520,7 @@ def llama_log_set(
 def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p):
     return _lib.llama_dump_timing_info_yaml(stream, ctx)
 
+
 _lib.llama_dump_timing_info_yaml.argtypes = [ctypes.c_void_p, llama_context_p]
 _lib.llama_dump_timing_info_yaml.restype = None
 