@@ -41,6 +41,11 @@
     CreateEmbeddingRequest,
     CreateChatCompletionRequest,
     ModelList,
+    TokenizeInputRequest,
+    TokenizeInputResponse,
+    TokenizeInputCountResponse,
+    DetokenizeInputRequest,
+    DetokenizeInputResponse,
 )
 from llama_cpp.server.errors import RouteErrorHandler
 
@@ -196,6 +201,9 @@ async def authenticate(
     )
 
 
+openai_v1_tag = "OpenAI V1"
+
+
 @router.post(
     "/v1/completions",
     summary="Completion",
@@ -227,11 +235,13 @@ async def authenticate(
             },
         }
     },
+    tags=[openai_v1_tag],
 )
 @router.post(
     "/v1/engines/copilot-codex/completions",
     include_in_schema=False,
     dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
 )
 async def create_completion(
     request: Request,
@@ -297,7 +307,10 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:
 
 
 @router.post(
-    "/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)]
+    "/v1/embeddings",
+    summary="Embedding",
+    dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
 )
 async def create_embedding(
     request: CreateEmbeddingRequest,
@@ -339,6 +352,7 @@ async def create_embedding(
             },
         }
     },
+    tags=[openai_v1_tag],
 )
 async def create_chat_completion(
     request: Request,
@@ -391,7 +405,12 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]:
     return iterator_or_completion
 
 
-@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)])
+@router.get(
+    "/v1/models",
+    summary="Models",
+    dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
+)
 async def get_models(
     llama_proxy: LlamaProxy = Depends(get_llama_proxy),
 ) -> ModelList:
@@ -407,3 +426,51 @@ async def get_models(
             for model_alias in llama_proxy
         ],
     }
+
+
+extras_tag = "Extras"
+
+
+@router.post(
+    "/extras/tokenize",
+    summary="Tokenize",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def tokenize(
+    body: TokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> TokenizeInputResponse:
+    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
+
+    return {"tokens": tokens}
+
+
+@router.post(
+    "/extras/tokenize/count",
+    summary="Tokenize Count",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def count_query_tokens(
+    body: TokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> TokenizeInputCountResponse:
+    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
+
+    return {"count": len(tokens)}
+
+
+@router.post(
+    "/extras/detokenize",
+    summary="Detokenize",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def detokenize(
+    body: DetokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> DetokenizeInputResponse:
+    text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8")
+
+    return {"text": text}