
Commit 877ca6d

Merge branch 'main' into fix-state-pickle

2 parents 10b0cb7 + b6f9388

File tree: 11 files changed (+234, -127 lines)

CHANGELOG.md

Lines changed: 22 additions & 0 deletions
@@ -7,6 +7,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
 
+## [0.1.65]
+
+### Added
+
+- (llama.cpp) Fix struct misalignment bug
+
+## [0.1.64]
+
+### Added
+
+- (llama.cpp) Update llama.cpp
+- Fix docs for seed. Set -1 for random.
+
+## [0.1.63]
+
+### Added
+
+- (llama.cpp) Add full gpu utilisation in CUDA
+- (llama.cpp) Add get_vocab
+- (llama.cpp) Add low_vram parameter
+- (server) Add logit_bias parameter
+
 ## [0.1.62]
 
 ### Fixed

README.md

Lines changed: 4 additions & 1 deletion
@@ -17,6 +17,7 @@ This package provides:
 
 Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python).
 
+
 ## Installation from PyPI (recommended)
 
 Install from PyPI (requires a c compiler):
@@ -25,7 +26,7 @@ Install from PyPI (requires a c compiler):
 pip install llama-cpp-python
 ```
 
-The above command will attempt to install the package and build build `llama.cpp` from source.
+The above command will attempt to install the package and build `llama.cpp` from source.
 This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system.
 
 If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly:
@@ -70,6 +71,8 @@ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable before installing:
 CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
 
+Detailed MacOS Metal GPU install documentation is available at [docs/macos_install.md](docs/macos_install.md)
+
 ## High-level API
 
 The high-level API provides a simple managed interface through the `Llama` class.
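For context on the `Llama` class referenced in the trailing context line above, here is a minimal high-level API sketch in the spirit of the README; the model path is a placeholder that must point at a real GGML model file:

```python
from llama_cpp import Llama

# Placeholder path: substitute your own GGML model file.
llm = Llama(model_path="./models/7B/ggml-model.bin")

# One-shot completion; the stop sequences keep the answer short.
output = llm(
    "Q: Name the planets in the solar system? A: ",
    max_tokens=32,
    stop=["Q:", "\n"],
    echo=True,
)
print(output["choices"][0]["text"])
```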

docs/macos_install.md

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+
+# llama-cpp-python - MacOS Install with Metal GPU
+
+
+**(1) Make sure you have xcode installed... at least the command line parts**
+```
+# check the path of your xcode install
+xcode-select -p
+
+# xcode installed returns
+# /Applications/Xcode-beta.app/Contents/Developer
+
+# if xcode is missing then install it... it takes ages
+xcode-select --install
+```
+
+**(2) Install the conda version for MacOS that supports Metal GPU**
+```
+wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
+bash Miniforge3-MacOSX-arm64.sh
+```
+
+**(3) Make a conda environment**
+```
+conda create -n llama python=3.9.16
+conda activate llama
+```
+
+**(4) Install the LATEST llama-cpp-python... which, as of just today, happily supports MacOS Metal GPU**
+*(you need xcode installed in order for pip to build/compile the C++ code)*
+```
+pip uninstall llama-cpp-python -y
+CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir
+pip install 'llama-cpp-python[server]'
+
+# you should now have llama-cpp-python v0.1.62 installed
+llama-cpp-python         0.1.62
+
+```
+
+**(5) Download a v3 ggml model**
+- **ggmlv3**
+- file name ends with **q4_0.bin** - indicating it is 4-bit quantized, with quantisation method 0
+
+https://huggingface.co/TheBloke/open-llama-7b-open-instruct-GGML
+
+
+**(6) Run the llama-cpp-python API server with MacOS Metal GPU support**
+```
+# config your ggml model path
+# make sure it is ggml v3
+# make sure it is q4_0
+export MODEL=[path to your llama.cpp ggml models]/[ggml-model-name]q4_0.bin
+python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1
+```
+
+***Note:** If you omit `--n_gpu_layers 1` then the CPU will be used.*
+
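Once the server from step (6) is running, it exposes an OpenAI-compatible HTTP API. A minimal sketch of querying it, assuming the default host and port of localhost:8000 (the endpoint path and payload follow the OpenAI completions convention the server implements):

```python
import requests

# Assumes the server started in step (6) is listening on the default port.
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": "Q: Name the planets in the solar system? A: ",
        "max_tokens": 64,
        "stop": ["Q:", "\n"],
    },
    timeout=120,
)
print(resp.json()["choices"][0]["text"])
```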

llama_cpp/llama.py

Lines changed: 8 additions & 2 deletions
@@ -221,6 +221,7 @@ def __init__(
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
+        low_vram: bool = False,
         verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
@@ -229,7 +230,7 @@ def __init__(
             model_path: Path to the model.
             n_ctx: Maximum context size.
             n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
-            seed: Random seed. 0 for random.
+            seed: Random seed. -1 for random.
             f16_kv: Use half-precision for key/value cache.
             logits_all: Return logits for all tokens, not just the last token.
             vocab_only: Only load the vocabulary no weights.
@@ -262,6 +263,7 @@ def __init__(
         self.params.use_mmap = use_mmap if lora_path is None else False
         self.params.use_mlock = use_mlock
         self.params.embedding = embedding
+        self.params.low_vram = low_vram
 
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
@@ -814,7 +816,7 @@ def _create_completion(
             llama_cpp.llama_reset_timings(self.ctx)
 
         if len(prompt_tokens) > self._n_ctx:
-            raise ValueError(f"Requested tokens exceed context window of {self._n_ctx}")
+            raise ValueError(f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}")
 
         # Truncate max_tokens if requested tokens would exceed the context window
         max_tokens = (
@@ -1380,6 +1382,7 @@ def create_chat_completion(
         mirostat_tau: float = 5.0,
         mirostat_eta: float = 0.1,
         model: Optional[str] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         """Generate a chat completion from a list of messages.
 
@@ -1421,6 +1424,7 @@ def create_chat_completion(
             mirostat_tau=mirostat_tau,
             mirostat_eta=mirostat_eta,
             model=model,
+            logits_processor=logits_processor,
         )
         if stream:
             chunks: Iterator[CompletionChunk] = completion_or_chunks  # type: ignore
@@ -1447,6 +1451,7 @@ def __getstate__(self):
             use_mmap=self.params.use_mmap,
             use_mlock=self.params.use_mlock,
             embedding=self.params.embedding,
+            low_vram=self.params.low_vram,
             last_n_tokens_size=self.last_n_tokens_size,
             n_batch=self.n_batch,
             n_threads=self.n_threads,
@@ -1470,6 +1475,7 @@ def __setstate__(self, state):
             use_mmap=state["use_mmap"],
             use_mlock=state["use_mlock"],
             embedding=state["embedding"],
+            low_vram=state["low_vram"],
             n_threads=state["n_threads"],
             n_batch=state["n_batch"],
             last_n_tokens_size=state["last_n_tokens_size"],
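The `__getstate__`/`__setstate__` hunks above add `low_vram` to the pickled state so the flag survives serialization, which is the point of the fix-state-pickle branch. A minimal sketch of the round trip; the model path is a placeholder, and unpickling re-runs `__init__`, so this only runs against a real GGML model file:

```python
import pickle

from llama_cpp import Llama

# Placeholder path: substitute a real GGML model file.
llm = Llama(model_path="./models/7B/ggml-model.bin", low_vram=True)

# __getstate__ captures the constructor arguments (now including low_vram) and
# __setstate__ re-runs __init__, so the restored instance keeps the same flag
# instead of silently falling back to the default.
restored = pickle.loads(pickle.dumps(llm))
assert bool(restored.params.low_vram) == bool(llm.params.low_vram)
```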

llama_cpp/llama_cpp.py

Lines changed: 35 additions & 17 deletions
@@ -150,45 +150,43 @@ class llama_token_data_array(Structure):
 
 
 # struct llama_context_params {
+#     int seed;                              // RNG seed, -1 for random
 #     int n_ctx;                             // text context
 #     int n_batch;                           // prompt processing batch size
 #     int n_gpu_layers;                      // number of layers to store in VRAM
 #     int main_gpu;                          // the GPU that is used for scratch and small tensors
 #     float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
-#     int seed;                              // RNG seed, -1 for random
+#     // called with a progress value between 0 and 1, pass NULL to disable
+#     llama_progress_callback progress_callback;
+#     // context pointer passed to the progress callback
+#     void * progress_callback_user_data;
 
+#     // Keep the booleans together to avoid misalignment during copy-by-value.
+#     bool low_vram;   // if true, reduce VRAM usage at the cost of performance
 #     bool f16_kv;     // use fp16 for KV cache
 #     bool logits_all; // the llama_eval() call computes all logits, not just the last one
 #     bool vocab_only; // only load the vocabulary, no weights
 #     bool use_mmap;   // use mmap if possible
 #     bool use_mlock;  // force system to keep model in RAM
 #     bool embedding;  // embedding mode only
-
-
-#     // called with a progress value between 0 and 1, pass NULL to disable
-#     llama_progress_callback progress_callback;
-#     // context pointer passed to the progress callback
-#     void * progress_callback_user_data;
 # };
 class llama_context_params(Structure):
     _fields_ = [
+        ("seed", c_int),
         ("n_ctx", c_int),
         ("n_batch", c_int),
         ("n_gpu_layers", c_int),
         ("main_gpu", c_int),
         ("tensor_split", c_float * LLAMA_MAX_DEVICES.value),
-        ("seed", c_int),
+        ("progress_callback", llama_progress_callback),
+        ("progress_callback_user_data", c_void_p),
+        ("low_vram", c_bool),
         ("f16_kv", c_bool),
-        (
-            "logits_all",
-            c_bool,
-        ),
+        ("logits_all", c_bool),
         ("vocab_only", c_bool),
         ("use_mmap", c_bool),
         ("use_mlock", c_bool),
         ("embedding", c_bool),
-        ("progress_callback", llama_progress_callback),
-        ("progress_callback_user_data", c_void_p),
     ]
 
 
@@ -555,6 +553,26 @@ def llama_n_embd(ctx: llama_context_p) -> int:
 _lib.llama_n_embd.restype = c_int
 
 
+# // Get the vocabulary as output parameters.
+# // Returns number of results.
+# LLAMA_API int llama_get_vocab(
+#         const struct llama_context * ctx,
+#         const char * * strings,
+#         float * scores,
+#         int capacity);
+def llama_get_vocab(
+    ctx: llama_context_p,
+    strings,  # type: Array[c_char_p] # type: ignore
+    scores,  # type: Array[c_float] # type: ignore
+    capacity: c_int,
+) -> int:
+    return _lib.llama_get_vocab(ctx, strings, scores, capacity)
+
+
+_lib.llama_get_vocab.argtypes = [llama_context_p, c_char_p, c_float, c_int]
+_lib.llama_get_vocab.restype = c_int
+
+
 # Token logits obtained from the last call to llama_eval()
 # The logits for the last token are stored in the last row
 # Can be mutated in order to change the probabilities of the next token
@@ -596,7 +614,7 @@ def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes:
 # Special tokens
 
 
-# LLAMA_API llama_token llama_token_bos();
+# LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
 def llama_token_bos() -> int:
     return _lib.llama_token_bos()
 
@@ -605,7 +623,7 @@ def llama_token_bos() -> int:
 _lib.llama_token_bos.restype = llama_token
 
 
-# LLAMA_API llama_token llama_token_eos();
+# LLAMA_API llama_token llama_token_eos();  // end-of-sentence
 def llama_token_eos() -> int:
     return _lib.llama_token_eos()
 
@@ -614,7 +632,7 @@ def llama_token_eos() -> int:
 _lib.llama_token_eos.restype = llama_token
 
 
-# LLAMA_API llama_token llama_token_nl();
+# LLAMA_API llama_token llama_token_nl();  // next-line
 def llama_token_nl() -> int:
     return _lib.llama_token_nl()
 
llama_cpp/server/__main__.py

Lines changed: 1 addition & 1 deletion
@@ -46,5 +46,5 @@
     app = create_app(settings=settings)
 
     uvicorn.run(
-        app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000))
+        app, host=os.getenv("HOST", settings.host), port=int(os.getenv("PORT", settings.port))
     )
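The one-line change above makes the `HOST`/`PORT` environment variables fall back to the values carried by `settings` rather than hard-coded defaults. A sketch of the resulting precedence; the `Settings` stand-in below is illustrative only, not the server's actual settings class:

```python
import os


class Settings:
    """Illustrative stand-in for the server's settings object."""

    host: str = "localhost"
    port: int = 8000


settings = Settings()

# Environment variables still take priority, but the defaults now come from
# settings, so configured values are no longer silently overridden.
host = os.getenv("HOST", settings.host)
port = int(os.getenv("PORT", settings.port))
print(f"serving on {host}:{port}")
```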
