2727import numpy .typing as npt
2828
2929
30+ # Disable warning for model and model_alias settings
3031BaseSettings .model_config ['protected_namespaces' ] = ()
3132
3233
@@ -58,14 +59,10 @@ class Settings(BaseSettings):
5859 description = "Split layers across multiple GPUs in proportion." ,
5960 )
6061 rope_freq_base : float = Field (
61- default = 10000 , ge = 1 , description = "RoPE base frequency"
62+ default = 0.0 , description = "RoPE base frequency"
6263 )
6364 rope_freq_scale : float = Field (
64- default = 1.0 , description = "RoPE frequency scaling factor"
65- )
66- low_vram : bool = Field (
67- default = False ,
68- description = "Whether to use less VRAM. This will reduce performance." ,
65+ default = 0.0 , description = "RoPE frequency scaling factor"
6966 )
7067 mul_mat_q : bool = Field (
7168 default = True , description = "if true, use experimental mul_mat_q kernels"
@@ -106,6 +103,10 @@ class Settings(BaseSettings):
106103 default = False ,
107104 description = "Enable NUMA support." ,
108105 )
106+ chat_format : str = Field (
107+ default = "llama-2" ,
108+ description = "Chat format to use." ,
109+ )
109110 cache : bool = Field (
110111 default = False ,
111112 description = "Use a cache to reduce processing times for evaluated prompts." ,
@@ -349,7 +350,6 @@ def create_app(settings: Optional[Settings] = None):
349350 tensor_split = settings .tensor_split ,
350351 rope_freq_base = settings .rope_freq_base ,
351352 rope_freq_scale = settings .rope_freq_scale ,
352- low_vram = settings .low_vram ,
353353 mul_mat_q = settings .mul_mat_q ,
354354 f16_kv = settings .f16_kv ,
355355 logits_all = settings .logits_all ,
@@ -361,6 +361,8 @@ def create_app(settings: Optional[Settings] = None):
361361 last_n_tokens_size = settings .last_n_tokens_size ,
362362 lora_base = settings .lora_base ,
363363 lora_path = settings .lora_path ,
364+ numa = settings .numa ,
365+ chat_format = settings .chat_format ,
364366 verbose = settings .verbose ,
365367 )
366368 if settings .cache :
0 commit comments