@@ -165,12 +165,16 @@ class llama_token_data_array(Structure):
 # int32_t n_gpu_layers; // number of layers to store in VRAM
 # int32_t main_gpu; // the GPU that is used for scratch and small tensors
 # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+# // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+# float rope_freq_base; // RoPE base frequency
+# float rope_freq_scale; // RoPE frequency scaling factor
+
 # // called with a progress value between 0 and 1, pass NULL to disable
 # llama_progress_callback progress_callback;
 # // context pointer passed to the progress callback
 # void * progress_callback_user_data;

-
 # // Keep the booleans together to avoid misalignment during copy-by-value.
 # bool low_vram; // if true, reduce VRAM usage at the cost of performance
 # bool f16_kv; // use fp16 for KV cache
@@ -190,6 +194,8 @@ class llama_context_params(Structure):
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
         ("tensor_split", c_float * LLAMA_MAX_DEVICES.value),
+        ("rope_freq_base", c_float),
+        ("rope_freq_scale", c_float),
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", c_void_p),
         ("low_vram", c_bool),
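The two new fields are plain ctypes floats, so callers can set them on a params struct before creating a context. A minimal sketch, assuming the package is imported as llama_cpp and that llama_context_default_params() fills in the remaining fields:

import llama_cpp

params = llama_cpp.llama_context_default_params()
params.n_ctx = 4096
params.rope_freq_base = 10000.0  # default RoPE base frequency
params.rope_freq_scale = 0.5     # linear scaling; 0.5 is commonly used to stretch context ~2x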
@@ -328,13 +334,23 @@ def llama_mlock_supported() -> bool:
 # // Initialize the llama + ggml backend
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
-# LLAMA_API void llama_init_backend(bool numa);
-def llama_init_backend(numa: c_bool):
-    return _lib.llama_init_backend(numa)
+# LLAMA_API void llama_backend_init(bool numa);
+def llama_backend_init(numa: c_bool):
+    return _lib.llama_backend_init(numa)
+
+
+_lib.llama_backend_init.argtypes = [c_bool]
+_lib.llama_backend_init.restype = None
+

+# // Call once at the end of the program - currently only used for MPI
+# LLAMA_API void llama_backend_free();
+def llama_backend_free():
+    return _lib.llama_backend_free()

-_lib.llama_init_backend.argtypes = [c_bool]
-_lib.llama_init_backend.restype = None
+
+_lib.llama_backend_free.argtypes = []
+_lib.llama_backend_free.restype = None


 # LLAMA_API struct llama_model * llama_load_model_from_file(
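A hedged sketch of the intended call pattern for the renamed pair (note that the module itself already calls llama_backend_init once at import time, as shown in the last hunk below):

from ctypes import c_bool
import llama_cpp

llama_cpp.llama_backend_init(c_bool(False))  # numa=False; call once at program start
try:
    pass  # load models, create contexts, run inference
finally:
    llama_cpp.llama_backend_free()  # call once at program end (currently only used for MPI)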
@@ -648,6 +664,22 @@ def llama_tokenize(
 _lib.llama_tokenize.restype = c_int


+# LLAMA_API int llama_tokenize_with_model(
+#     const struct llama_model * model,
+#     const char * text,
+#     llama_token * tokens,
+#     int n_max_tokens,
+#     bool add_bos);
+def llama_tokenize_with_model(
+    model: llama_model_p,
+    text: bytes,
+    tokens,  # type: Array[llama_token]
+    n_max_tokens: c_int,
+    add_bos: c_bool,
+) -> int:
+    return _lib.llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos)
+
+
 # LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
 def llama_n_vocab(ctx: llama_context_p) -> int:
     return _lib.llama_n_vocab(ctx)
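A usage sketch for the new model-level tokenizer; the model path is a placeholder and error handling is omitted. As with llama_tokenize, a negative return value presumably signals that the token buffer was too small:

from ctypes import c_bool, c_int
import llama_cpp

params = llama_cpp.llama_context_default_params()
model = llama_cpp.llama_load_model_from_file(b"./models/7B/ggml-model.bin", params)

n_max_tokens = 64
tokens = (llama_cpp.llama_token * n_max_tokens)()
n = llama_cpp.llama_tokenize_with_model(
    model, b"Hello, world", tokens, c_int(n_max_tokens), c_bool(True)
)
print(list(tokens[:n]))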
@@ -675,6 +707,33 @@ def llama_n_embd(ctx: llama_context_p) -> int:
 _lib.llama_n_embd.restype = c_int


+# LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+def llama_n_vocab_from_model(model: llama_model_p) -> int:
+    return _lib.llama_n_vocab_from_model(model)
+
+
+_lib.llama_n_vocab_from_model.argtypes = [llama_model_p]
+_lib.llama_n_vocab_from_model.restype = c_int
+
+
+# LLAMA_API int llama_n_ctx_from_model(const struct llama_model * model);
+def llama_n_ctx_from_model(model: llama_model_p) -> int:
+    return _lib.llama_n_ctx_from_model(model)
+
+
+_lib.llama_n_ctx_from_model.argtypes = [llama_model_p]
+_lib.llama_n_ctx_from_model.restype = c_int
+
+
+# LLAMA_API int llama_n_embd_from_model(const struct llama_model * model);
+def llama_n_embd_from_model(model: llama_model_p) -> int:
+    return _lib.llama_n_embd_from_model(model)
+
+
+_lib.llama_n_embd_from_model.argtypes = [llama_model_p]
+_lib.llama_n_embd_from_model.restype = c_int
+
+
 # // Get the vocabulary as output parameters.
 # // Returns number of results.
 # LLAMA_API int llama_get_vocab(
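With these bindings, model metadata can be queried without creating a llama_context; `model` below is assumed to come from llama_load_model_from_file as in the earlier sketch:

n_vocab = llama_cpp.llama_n_vocab_from_model(model)  # vocabulary size
n_ctx = llama_cpp.llama_n_ctx_from_model(model)      # context length
n_embd = llama_cpp.llama_n_embd_from_model(model)    # embedding dimension
print(n_vocab, n_ctx, n_embd)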
@@ -695,6 +754,20 @@ def llama_get_vocab(
 _lib.llama_get_vocab.restype = c_int


+# LLAMA_API int llama_get_vocab_from_model(
+#     const struct llama_model * model,
+#     const char * * strings,
+#     float * scores,
+#     int capacity);
+def llama_get_vocab_from_model(
+    model: llama_model_p,
+    strings,  # type: Array[c_char_p] # type: ignore
+    scores,  # type: Array[c_float] # type: ignore
+    capacity: c_int,
+) -> int:
+    return _lib.llama_get_vocab_from_model(model, strings, scores, capacity)
+
+
 # Token logits obtained from the last call to llama_eval()
 # The logits for the last token are stored in the last row
 # Can be mutated in order to change the probabilities of the next token
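A sketch of dumping vocabulary entries straight from the model; sizing the buffers with llama_n_vocab_from_model is an assumption that mirrors how llama_get_vocab is typically used, and `model` again comes from the earlier sketch:

from ctypes import c_char_p, c_float

capacity = llama_cpp.llama_n_vocab_from_model(model)
strings = (c_char_p * capacity)()
scores = (c_float * capacity)()
n = llama_cpp.llama_get_vocab_from_model(model, strings, scores, capacity)
for i in range(min(n, 10)):
    print(i, strings[i], scores[i])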
@@ -724,15 +797,28 @@ def llama_get_embeddings(
 _lib.llama_get_embeddings.restype = c_float_p


-# Token Id -> String. Uses the vocabulary in the provided context
-# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+# // Token Id -> String. Uses the vocabulary in the provided context
+# LLAMA_API const char * llama_token_to_str(
+#     const struct llama_context * ctx,
+#     llama_token token);
 def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes:
     return _lib.llama_token_to_str(ctx, token)


 _lib.llama_token_to_str.argtypes = [llama_context_p, llama_token]
 _lib.llama_token_to_str.restype = c_char_p

+
+# LLAMA_API const char * llama_token_to_str_with_model(
+#     const struct llama_model * model,
+#     llama_token token);
+def llama_token_to_str_with_model(model: llama_model_p, token: llama_token) -> bytes:
+    return _lib.llama_token_to_str_with_model(model, token)
+
+
+_lib.llama_token_to_str_with_model.argtypes = [llama_model_p, llama_token]
+_lib.llama_token_to_str_with_model.restype = c_char_p
+
 # Special tokens

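Paired with llama_tokenize_with_model above, this allows a context-free round trip. A sketch, reusing `model` and the ctypes imports from the tokenizer example:

buf = (llama_cpp.llama_token * 64)()
n = llama_cpp.llama_tokenize_with_model(model, b"The quick brown fox", buf, c_int(64), c_bool(True))
decoded = b"".join(llama_cpp.llama_token_to_str_with_model(model, buf[i]) for i in range(n))
print(decoded)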
@@ -821,6 +907,39 @@ def llama_sample_frequency_and_presence_penalties(
 _lib.llama_sample_frequency_and_presence_penalties.restype = None


+# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+# /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+# LLAMA_API void llama_sample_classifier_free_guidance(
+#     struct llama_context * ctx,
+#     llama_token_data_array * candidates,
+#     struct llama_context * guidance_ctx,
+#     float scale,
+#     float smooth_factor);
+def llama_sample_classifier_free_guidance(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    guidance_ctx: llama_context_p,
+    scale: c_float,
+    smooth_factor: c_float,
+):
+    return _lib.llama_sample_classifier_free_guidance(
+        ctx, candidates, guidance_ctx, scale, smooth_factor
+    )
+
+
+_lib.llama_sample_classifier_free_guidance.argtypes = [
+    llama_context_p,
+    llama_token_data_array_p,
+    llama_context_p,
+    c_float,
+    c_float,
+]
+_lib.llama_sample_classifier_free_guidance.restype = None
+
+
 # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 # LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 def llama_sample_softmax(
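A hedged end-to-end sketch of calling the new sampler. It assumes `ctx` and `guidance_ctx` are two contexts over the same model that have already been evaluated (guidance_ctx seeded with the negative prompt plus the tokens copied from the main context), and the scale/smooth_factor values are illustrative only:

import ctypes
import llama_cpp

n_vocab = llama_cpp.llama_n_vocab(ctx)
logits = llama_cpp.llama_get_logits(ctx)

# Candidates must be built from the unsorted logits of the main context.
candidates_data = (llama_cpp.llama_token_data * n_vocab)()
for i in range(n_vocab):
    candidates_data[i].id = i
    candidates_data[i].logit = logits[i]
    candidates_data[i].p = 0.0
candidates = llama_cpp.llama_token_data_array(
    data=ctypes.cast(candidates_data, llama_cpp.llama_token_data_p),
    size=n_vocab,
    sorted=False,
)

# scale=1.5 strengthens guidance; smooth_factor=0.5 blends guidance and original logits.
llama_cpp.llama_sample_classifier_free_guidance(
    ctx, ctypes.byref(candidates), guidance_ctx, 1.5, 0.5
)
token = llama_cpp.llama_sample_token(ctx, ctypes.byref(candidates))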
@@ -1065,5 +1184,5 @@ def llama_print_system_info() -> bytes:
 _llama_initialized = False

 if not _llama_initialized:
-    llama_init_backend(c_bool(False))
+    llama_backend_init(c_bool(False))
     _llama_initialized = True