@@ -23,6 +23,7 @@ def __init__(
         f16_kv: bool = False,
         logits_all: bool = False,
         vocab_only: bool = False,
+        use_mmap: bool = True,
         use_mlock: bool = False,
         embedding: bool = False,
         n_threads: Optional[int] = None,
@@ -40,6 +41,7 @@ def __init__(
             f16_kv: Use half-precision for key/value cache.
             logits_all: Return logits for all tokens, not just the last token.
             vocab_only: Only load the vocabulary no weights.
+            use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
             n_threads: Number of threads to use. If None, the number of threads is automatically determined.
@@ -63,6 +65,7 @@ def __init__(
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
         self.params.vocab_only = vocab_only
+        self.params.use_mmap = use_mmap
         self.params.use_mlock = use_mlock
         self.params.embedding = embedding

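For context, the new flag is simply threaded through at construction time. A minimal usage sketch, assuming a local GGML model file (the path below is a placeholder):

    from llama_cpp import Llama

    # use_mmap=True (the default) lets llama.cpp memory-map the weights
    # from disk; passing False reads the whole model into RAM instead.
    llm = Llama(model_path="./models/7B/ggml-model.bin", use_mmap=False)
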
@@ -661,6 +664,7 @@ def __getstate__(self):
             f16_kv=self.params.f16_kv,
             logits_all=self.params.logits_all,
             vocab_only=self.params.vocab_only,
+            use_mmap=self.params.use_mmap,
             use_mlock=self.params.use_mlock,
             embedding=self.params.embedding,
             last_n_tokens_size=self.last_n_tokens_size,
@@ -679,6 +683,7 @@ def __setstate__(self, state):
             f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],
             vocab_only=state["vocab_only"],
+            use_mmap=state["use_mmap"],
             use_mlock=state["use_mlock"],
             embedding=state["embedding"],
             n_threads=state["n_threads"],
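With __getstate__ and __setstate__ both carrying the flag, use_mmap now survives a pickle round trip. A sketch under two assumptions: unpickling reloads the model from model_path (so the file must still exist), and params exposes its ctypes fields directly:

    import pickle

    # Serializing captures the constructor arguments, including use_mmap;
    # deserializing rebuilds the Llama instance from model_path.
    data = pickle.dumps(llm)
    restored = pickle.loads(data)
    assert restored.params.use_mmap == llm.params.use_mmap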