@@ -165,12 +165,16 @@ class llama_token_data_array(Structure):
 # int32_t n_gpu_layers; // number of layers to store in VRAM
 # int32_t main_gpu; // the GPU that is used for scratch and small tensors
 # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+# // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+# float rope_freq_base; // RoPE base frequency
+# float rope_freq_scale; // RoPE frequency scaling factor
+
 # // called with a progress value between 0 and 1, pass NULL to disable
 # llama_progress_callback progress_callback;
 # // context pointer passed to the progress callback
 # void * progress_callback_user_data;

-
 # // Keep the booleans together to avoid misalignment during copy-by-value.
 # bool low_vram; // if true, reduce VRAM usage at the cost of performance
 # bool f16_kv; // use fp16 for KV cache
@@ -190,6 +194,8 @@ class llama_context_params(Structure):
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
         ("tensor_split", c_float * LLAMA_MAX_DEVICES.value),
+        ("rope_freq_base", c_float),
+        ("rope_freq_scale", c_float),
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", c_void_p),
         ("low_vram", c_bool),
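The two new fields are plain ctypes floats, so callers can set them on a params struct before creating a context. A minimal sketch, assuming the package is imported as llama_cpp and that llama_context_default_params() fills in the remaining fields:

import llama_cpp

params = llama_cpp.llama_context_default_params()
params.n_ctx = 4096
params.rope_freq_base = 10000.0  # default RoPE base frequency
params.rope_freq_scale = 0.5     # linear scaling; 0.5 is commonly used to stretch context ~2x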
@@ -328,13 +334,23 @@ def llama_mlock_supported() -> bool:
 # // Initialize the llama + ggml backend
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
-# LLAMA_API void llama_init_backend(bool numa);
-def llama_init_backend(numa: c_bool):
-    return _lib.llama_init_backend(numa)
+# LLAMA_API void llama_backend_init(bool numa);
+def llama_backend_init(numa: c_bool):
+    return _lib.llama_backend_init(numa)
+
+
+_lib.llama_backend_init.argtypes = [c_bool]
+_lib.llama_backend_init.restype = None
+

+# // Call once at the end of the program - currently only used for MPI
+# LLAMA_API void llama_backend_free();
+def llama_backend_free():
+    return _lib.llama_backend_free()

-_lib.llama_init_backend.argtypes = [c_bool]
-_lib.llama_init_backend.restype = None
+
+_lib.llama_backend_free.argtypes = []
+_lib.llama_backend_free.restype = None


 # LLAMA_API struct llama_model * llama_load_model_from_file(
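A hedged sketch of the intended call pattern for the renamed pair (note that the module itself already calls llama_backend_init once at import time, as shown in the last hunk below):

from ctypes import c_bool
import llama_cpp

llama_cpp.llama_backend_init(c_bool(False))  # numa=False; call once at program start
try:
    pass  # load models, create contexts, run inference
finally:
    llama_cpp.llama_backend_free()  # call once at program end (currently only used for MPI)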
@@ -648,6 +664,22 @@ def llama_tokenize(
 _lib.llama_tokenize.restype = c_int


+# LLAMA_API int llama_tokenize_with_model(
+#     const struct llama_model * model,
+#     const char * text,
+#     llama_token * tokens,
+#     int n_max_tokens,
+#     bool add_bos);
+def llama_tokenize_with_model(
+    model: llama_model_p,
+    text: bytes,
+    tokens,  # type: Array[llama_token]
+    n_max_tokens: c_int,
+    add_bos: c_bool,
+) -> int:
+    return _lib.llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos)
+
+
 # LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
 def llama_n_vocab(ctx: llama_context_p) -> int:
     return _lib.llama_n_vocab(ctx)
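A usage sketch for the new model-level tokenizer; the model path is a placeholder and error handling is omitted. As with llama_tokenize, a negative return value presumably signals that the token buffer was too small:

from ctypes import c_bool, c_int
import llama_cpp

params = llama_cpp.llama_context_default_params()
model = llama_cpp.llama_load_model_from_file(b"./models/7B/ggml-model.bin", params)

n_max_tokens = 64
tokens = (llama_cpp.llama_token * n_max_tokens)()
n = llama_cpp.llama_tokenize_with_model(
    model, b"Hello, world", tokens, c_int(n_max_tokens), c_bool(True)
)
print(list(tokens[:n]))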
@@ -675,6 +707,33 @@ def llama_n_embd(ctx: llama_context_p) -> int:
 _lib.llama_n_embd.restype = c_int


+# LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+def llama_n_vocab_from_model(model: llama_model_p) -> int:
+    return _lib.llama_n_vocab_from_model(model)
+
+
+_lib.llama_n_vocab_from_model.argtypes = [llama_model_p]
+_lib.llama_n_vocab_from_model.restype = c_int
+
+
+# LLAMA_API int llama_n_ctx_from_model(const struct llama_model * model);
+def llama_n_ctx_from_model(model: llama_model_p) -> int:
+    return _lib.llama_n_ctx_from_model(model)
+
+
+_lib.llama_n_ctx_from_model.argtypes = [llama_model_p]
+_lib.llama_n_ctx_from_model.restype = c_int
+
+
+# LLAMA_API int llama_n_embd_from_model(const struct llama_model * model);
+def llama_n_embd_from_model(model: llama_model_p) -> int:
+    return _lib.llama_n_embd_from_model(model)
+
+
+_lib.llama_n_embd_from_model.argtypes = [llama_model_p]
+_lib.llama_n_embd_from_model.restype = c_int
+
+
 # // Get the vocabulary as output parameters.
 # // Returns number of results.
 # LLAMA_API int llama_get_vocab(
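With these bindings, model metadata can be queried without creating a llama_context; `model` below is assumed to come from llama_load_model_from_file as in the earlier sketch:

n_vocab = llama_cpp.llama_n_vocab_from_model(model)  # vocabulary size
n_ctx = llama_cpp.llama_n_ctx_from_model(model)      # context length
n_embd = llama_cpp.llama_n_embd_from_model(model)    # embedding dimension
print(n_vocab, n_ctx, n_embd)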
@@ -695,6 +754,20 @@ def llama_get_vocab(
 _lib.llama_get_vocab.restype = c_int


+# LLAMA_API int llama_get_vocab_from_model(
+#     const struct llama_model * model,
+#     const char * * strings,
+#     float * scores,
+#     int capacity);
+def llama_get_vocab_from_model(
+    model: llama_model_p,
+    strings,  # type: Array[c_char_p] # type: ignore
+    scores,  # type: Array[c_float] # type: ignore
+    capacity: c_int,
+) -> int:
+    return _lib.llama_get_vocab_from_model(model, strings, scores, capacity)
+
+
 # Token logits obtained from the last call to llama_eval()
 # The logits for the last token are stored in the last row
 # Can be mutated in order to change the probabilities of the next token
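A sketch of dumping vocabulary entries straight from the model; sizing the buffers with llama_n_vocab_from_model is an assumption that mirrors how llama_get_vocab is typically used, and `model` again comes from the earlier sketch:

from ctypes import c_char_p, c_float

capacity = llama_cpp.llama_n_vocab_from_model(model)
strings = (c_char_p * capacity)()
scores = (c_float * capacity)()
n = llama_cpp.llama_get_vocab_from_model(model, strings, scores, capacity)
for i in range(min(n, 10)):
    print(i, strings[i], scores[i])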
@@ -724,15 +797,28 @@ def llama_get_embeddings(
 _lib.llama_get_embeddings.restype = c_float_p


-# Token Id -> String. Uses the vocabulary in the provided context
-# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+# // Token Id -> String. Uses the vocabulary in the provided context
+# LLAMA_API const char * llama_token_to_str(
+#     const struct llama_context * ctx,
+#     llama_token token);
 def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes:
     return _lib.llama_token_to_str(ctx, token)


 _lib.llama_token_to_str.argtypes = [llama_context_p, llama_token]
 _lib.llama_token_to_str.restype = c_char_p

+
+# LLAMA_API const char * llama_token_to_str_with_model(
+#     const struct llama_model * model,
+#     llama_token token);
+def llama_token_to_str_with_model(model: llama_model_p, token: llama_token) -> bytes:
+    return _lib.llama_token_to_str_with_model(model, token)
+
+
+_lib.llama_token_to_str_with_model.argtypes = [llama_model_p, llama_token]
+_lib.llama_token_to_str_with_model.restype = c_char_p
+
 # Special tokens

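Paired with llama_tokenize_with_model above, this allows a context-free round trip. A sketch, reusing `model` and the ctypes imports from the tokenizer example:

buf = (llama_cpp.llama_token * 64)()
n = llama_cpp.llama_tokenize_with_model(model, b"The quick brown fox", buf, c_int(64), c_bool(True))
decoded = b"".join(llama_cpp.llama_token_to_str_with_model(model, buf[i]) for i in range(n))
print(decoded)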
@@ -821,6 +907,39 @@ def llama_sample_frequency_and_presence_penalties(
 _lib.llama_sample_frequency_and_presence_penalties.restype = None


+# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+# /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+# LLAMA_API void llama_sample_classifier_free_guidance(
+#     struct llama_context * ctx,
+#     llama_token_data_array * candidates,
+#     struct llama_context * guidance_ctx,
+#     float scale,
+#     float smooth_factor);
+def llama_sample_classifier_free_guidance(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    guidance_ctx: llama_context_p,
+    scale: c_float,
+    smooth_factor: c_float,
+):
+    return _lib.llama_sample_classifier_free_guidance(
+        ctx, candidates, guidance_ctx, scale, smooth_factor
+    )
+
+
+_lib.llama_sample_classifier_free_guidance.argtypes = [
+    llama_context_p,
+    llama_token_data_array_p,
+    llama_context_p,
+    c_float,
+    c_float,
+]
+_lib.llama_sample_classifier_free_guidance.restype = None
+
+
 # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 # LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 def llama_sample_softmax(
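A hedged end-to-end sketch of calling the new sampler. It assumes `ctx` and `guidance_ctx` are two contexts over the same model that have already been evaluated (guidance_ctx seeded with the negative prompt plus the tokens copied from the main context), and the scale/smooth_factor values are illustrative only:

import ctypes
import llama_cpp

n_vocab = llama_cpp.llama_n_vocab(ctx)
logits = llama_cpp.llama_get_logits(ctx)

# Candidates must be built from the unsorted logits of the main context.
candidates_data = (llama_cpp.llama_token_data * n_vocab)()
for i in range(n_vocab):
    candidates_data[i].id = i
    candidates_data[i].logit = logits[i]
    candidates_data[i].p = 0.0
candidates = llama_cpp.llama_token_data_array(
    data=ctypes.cast(candidates_data, llama_cpp.llama_token_data_p),
    size=n_vocab,
    sorted=False,
)

# scale=1.5 strengthens guidance; smooth_factor=0.5 blends guidance and original logits.
llama_cpp.llama_sample_classifier_free_guidance(
    ctx, ctypes.byref(candidates), guidance_ctx, 1.5, 0.5
)
token = llama_cpp.llama_sample_token(ctx, ctypes.byref(candidates))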
@@ -1065,5 +1184,5 @@ def llama_print_system_info() -> bytes:
 _llama_initialized = False

 if not _llama_initialized:
-    llama_init_backend(c_bool(False))
+    llama_backend_init(c_bool(False))
     _llama_initialized = True