
Commit 877ca6d

Merge branch 'main' into fix-state-pickle

2 parents 10b0cb7 + b6f9388

File tree: 11 files changed (+234, -127 lines)

CHANGELOG.md

Lines changed: 22 additions & 0 deletions
@@ -7,6 +7,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
 
+## [0.1.65]
+
+### Added
+
+- (llama.cpp) Fix struct misalignment bug
+
+## [0.1.64]
+
+### Added
+
+- (llama.cpp) Update llama.cpp
+- Fix docs for seed. Set -1 for random.
+
+## [0.1.63]
+
+### Added
+
+- (llama.cpp) Add full gpu utilisation in CUDA
+- (llama.cpp) Add get_vocab
+- (llama.cpp) Add low_vram parameter
+- (server) Add logit_bias parameter
+
 ## [0.1.62]
 
 ### Fixed

README.md

Lines changed: 4 additions & 1 deletion
@@ -17,6 +17,7 @@ This package provides:
 
 Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python).
 
+
 ## Installation from PyPI (recommended)
 
 Install from PyPI (requires a c compiler):
@@ -25,7 +26,7 @@ Install from PyPI (requires a c compiler):
 pip install llama-cpp-python
 ```
 
-The above command will attempt to install the package and build build `llama.cpp` from source.
+The above command will attempt to install the package and build `llama.cpp` from source.
 This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system.
 
 If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly:
@@ -70,6 +71,8 @@ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable before installing:
 CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
 
+Detailed MacOS Metal GPU install documentation is available at [docs/macos_install.md](docs/macos_install.md)
+
 ## High-level API
 
 The high-level API provides a simple managed interface through the `Llama` class.
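For context on the `Llama` class referenced in the trailing context line above, here is a minimal high-level API sketch in the spirit of the README; the model path is a placeholder that must point at a real GGML model file:

```python
from llama_cpp import Llama

# Placeholder path: substitute your own GGML model file.
llm = Llama(model_path="./models/7B/ggml-model.bin")

# One-shot completion; the stop sequences keep the answer short.
output = llm(
    "Q: Name the planets in the solar system? A: ",
    max_tokens=32,
    stop=["Q:", "\n"],
    echo=True,
)
print(output["choices"][0]["text"])
```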

docs/macos_install.md

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+
+# llama-cpp-python - MacOS Install with Metal GPU
+
+
+**(1) Make sure you have xcode installed... at least the command line parts**
+```
+# check the path of your xcode install
+xcode-select -p
+
+# xcode installed returns
+# /Applications/Xcode-beta.app/Contents/Developer
+
+# if xcode is missing then install it... it takes ages
+xcode-select --install
+```
+
+**(2) Install the conda version for MacOS that supports Metal GPU**
+```
+wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
+bash Miniforge3-MacOSX-arm64.sh
+```
+
+**(3) Make a conda environment**
+```
+conda create -n llama python=3.9.16
+conda activate llama
+```
+
+**(4) Install the LATEST llama-cpp-python... which, as of just today, happily supports MacOS Metal GPU**
+*(you need xcode installed in order for pip to build/compile the C++ code)*
+```
+pip uninstall llama-cpp-python -y
+CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir
+pip install 'llama-cpp-python[server]'
+
+# you should now have llama-cpp-python v0.1.62 installed
+llama-cpp-python         0.1.62
+
+```
+
+**(5) Download a v3 ggml model**
+- **ggmlv3**
+- file name ends with **q4_0.bin** - indicating it is 4-bit quantized, with quantisation method 0
+
+https://huggingface.co/TheBloke/open-llama-7b-open-instruct-GGML
+
+
+**(6) Run the llama-cpp-python API server with MacOS Metal GPU support**
+```
+# config your ggml model path
+# make sure it is ggml v3
+# make sure it is q4_0
+export MODEL=[path to your llama.cpp ggml models]/[ggml-model-name]q4_0.bin
+python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1
+```
+
+***Note:** If you omit `--n_gpu_layers 1` then the CPU will be used.*
+
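Once the server from step (6) is running, it exposes an OpenAI-compatible HTTP API. A minimal sketch of querying it, assuming the default host and port of localhost:8000 (the endpoint path and payload follow the OpenAI completions convention the server implements):

```python
import requests

# Assumes the server started in step (6) is listening on the default port.
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": "Q: Name the planets in the solar system? A: ",
        "max_tokens": 64,
        "stop": ["Q:", "\n"],
    },
    timeout=120,
)
print(resp.json()["choices"][0]["text"])
```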

llama_cpp/llama.py

Lines changed: 8 additions & 2 deletions
@@ -221,6 +221,7 @@ def __init__(
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
+        low_vram: bool = False,
         verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
@@ -229,7 +230,7 @@ def __init__(
             model_path: Path to the model.
             n_ctx: Maximum context size.
             n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
-            seed: Random seed. 0 for random.
+            seed: Random seed. -1 for random.
             f16_kv: Use half-precision for key/value cache.
             logits_all: Return logits for all tokens, not just the last token.
             vocab_only: Only load the vocabulary no weights.
@@ -262,6 +263,7 @@ def __init__(
         self.params.use_mmap = use_mmap if lora_path is None else False
         self.params.use_mlock = use_mlock
         self.params.embedding = embedding
+        self.params.low_vram = low_vram
 
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
@@ -814,7 +816,7 @@ def _create_completion(
             llama_cpp.llama_reset_timings(self.ctx)
 
         if len(prompt_tokens) > self._n_ctx:
-            raise ValueError(f"Requested tokens exceed context window of {self._n_ctx}")
+            raise ValueError(f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}")
 
         # Truncate max_tokens if requested tokens would exceed the context window
         max_tokens = (
@@ -1380,6 +1382,7 @@ def create_chat_completion(
         mirostat_tau: float = 5.0,
         mirostat_eta: float = 0.1,
         model: Optional[str] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         """Generate a chat completion from a list of messages.
 
@@ -1421,6 +1424,7 @@ def create_chat_completion(
             mirostat_tau=mirostat_tau,
             mirostat_eta=mirostat_eta,
             model=model,
+            logits_processor=logits_processor,
         )
         if stream:
             chunks: Iterator[CompletionChunk] = completion_or_chunks  # type: ignore
@@ -1447,6 +1451,7 @@ def __getstate__(self):
             use_mmap=self.params.use_mmap,
             use_mlock=self.params.use_mlock,
             embedding=self.params.embedding,
+            low_vram=self.params.low_vram,
             last_n_tokens_size=self.last_n_tokens_size,
             n_batch=self.n_batch,
             n_threads=self.n_threads,
@@ -1470,6 +1475,7 @@ def __setstate__(self, state):
             use_mmap=state["use_mmap"],
             use_mlock=state["use_mlock"],
             embedding=state["embedding"],
+            low_vram=state["low_vram"],
             n_threads=state["n_threads"],
             n_batch=state["n_batch"],
             last_n_tokens_size=state["last_n_tokens_size"],
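The `__getstate__`/`__setstate__` hunks above add `low_vram` to the pickled state so the flag survives serialization, which is the point of the fix-state-pickle branch. A minimal sketch of the round trip; the model path is a placeholder, and unpickling re-runs `__init__`, so this only runs against a real GGML model file:

```python
import pickle

from llama_cpp import Llama

# Placeholder path: substitute a real GGML model file.
llm = Llama(model_path="./models/7B/ggml-model.bin", low_vram=True)

# __getstate__ captures the constructor arguments (now including low_vram) and
# __setstate__ re-runs __init__, so the restored instance keeps the same flag
# instead of silently falling back to the default.
restored = pickle.loads(pickle.dumps(llm))
assert bool(restored.params.low_vram) == bool(llm.params.low_vram)
```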

llama_cpp/llama_cpp.py

Lines changed: 35 additions & 17 deletions
@@ -150,45 +150,43 @@ class llama_token_data_array(Structure):
 
 
 # struct llama_context_params {
+#     int seed;                              // RNG seed, -1 for random
 #     int n_ctx;                             // text context
 #     int n_batch;                           // prompt processing batch size
 #     int n_gpu_layers;                      // number of layers to store in VRAM
 #     int main_gpu;                          // the GPU that is used for scratch and small tensors
 #     float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
-#     int seed;                              // RNG seed, -1 for random
+#     // called with a progress value between 0 and 1, pass NULL to disable
+#     llama_progress_callback progress_callback;
+#     // context pointer passed to the progress callback
+#     void * progress_callback_user_data;
 
+#     // Keep the booleans together to avoid misalignment during copy-by-value.
+#     bool low_vram;   // if true, reduce VRAM usage at the cost of performance
 #     bool f16_kv;     // use fp16 for KV cache
 #     bool logits_all; // the llama_eval() call computes all logits, not just the last one
 #     bool vocab_only; // only load the vocabulary, no weights
 #     bool use_mmap;   // use mmap if possible
 #     bool use_mlock;  // force system to keep model in RAM
 #     bool embedding;  // embedding mode only
-
-
-#     // called with a progress value between 0 and 1, pass NULL to disable
-#     llama_progress_callback progress_callback;
-#     // context pointer passed to the progress callback
-#     void * progress_callback_user_data;
 # };
 class llama_context_params(Structure):
     _fields_ = [
+        ("seed", c_int),
         ("n_ctx", c_int),
         ("n_batch", c_int),
         ("n_gpu_layers", c_int),
         ("main_gpu", c_int),
         ("tensor_split", c_float * LLAMA_MAX_DEVICES.value),
-        ("seed", c_int),
+        ("progress_callback", llama_progress_callback),
+        ("progress_callback_user_data", c_void_p),
+        ("low_vram", c_bool),
         ("f16_kv", c_bool),
-        (
-            "logits_all",
-            c_bool,
-        ),
+        ("logits_all", c_bool),
         ("vocab_only", c_bool),
         ("use_mmap", c_bool),
         ("use_mlock", c_bool),
         ("embedding", c_bool),
-        ("progress_callback", llama_progress_callback),
-        ("progress_callback_user_data", c_void_p),
     ]
 
 
@@ -555,6 +553,26 @@ def llama_n_embd(ctx: llama_context_p) -> int:
 _lib.llama_n_embd.restype = c_int
 
 
+# // Get the vocabulary as output parameters.
+# // Returns number of results.
+# LLAMA_API int llama_get_vocab(
+#         const struct llama_context * ctx,
+#         const char * * strings,
+#         float * scores,
+#         int capacity);
+def llama_get_vocab(
+    ctx: llama_context_p,
+    strings,  # type: Array[c_char_p] # type: ignore
+    scores,  # type: Array[c_float] # type: ignore
+    capacity: c_int,
+) -> int:
+    return _lib.llama_get_vocab(ctx, strings, scores, capacity)
+
+
+_lib.llama_get_vocab.argtypes = [llama_context_p, c_char_p, c_float, c_int]
+_lib.llama_get_vocab.restype = c_int
+
+
 # Token logits obtained from the last call to llama_eval()
 # The logits for the last token are stored in the last row
 # Can be mutated in order to change the probabilities of the next token
@@ -596,7 +614,7 @@ def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes:
 # Special tokens
 
 
-# LLAMA_API llama_token llama_token_bos();
+# LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
 def llama_token_bos() -> int:
     return _lib.llama_token_bos()
 
@@ -605,7 +623,7 @@ def llama_token_bos() -> int:
 _lib.llama_token_bos.restype = llama_token
 
 
-# LLAMA_API llama_token llama_token_eos();
+# LLAMA_API llama_token llama_token_eos();  // end-of-sentence
 def llama_token_eos() -> int:
     return _lib.llama_token_eos()
 
@@ -614,7 +632,7 @@ def llama_token_eos() -> int:
 _lib.llama_token_eos.restype = llama_token
 
 
-# LLAMA_API llama_token llama_token_nl();
+# LLAMA_API llama_token llama_token_nl();  // next-line
 def llama_token_nl() -> int:
     return _lib.llama_token_nl()
 
llama_cpp/server/__main__.py

Lines changed: 1 addition & 1 deletion
@@ -46,5 +46,5 @@
     app = create_app(settings=settings)
 
     uvicorn.run(
-        app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000))
+        app, host=os.getenv("HOST", settings.host), port=int(os.getenv("PORT", settings.port))
     )
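The one-line change above makes the `HOST`/`PORT` environment variables fall back to the values carried by `settings` rather than hard-coded defaults. A sketch of the resulting precedence; the `Settings` stand-in below is illustrative only, not the server's actual settings class:

```python
import os


class Settings:
    """Illustrative stand-in for the server's settings object."""

    host: str = "localhost"
    port: int = 8000


settings = Settings()

# Environment variables still take priority, but the defaults now come from
# settings, so configured values are no longer silently overridden.
host = os.getenv("HOST", settings.host)
port = int(os.getenv("PORT", settings.port))
print(f"serving on {host}:{port}")
```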
