@@ -24,6 +24,11 @@ class LLaMAInteract:
     def __init__(self, params: GptParams) -> None:
         # input args
         self.params = params
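+        # fall back to empty strings so downstream code can assume plain str values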
+        if self.params.path_session is None:
+            self.params.path_session = ""
+        if self.params.antiprompt is None:
+            self.params.antiprompt = ""
 
         if (self.params.perplexity):
             raise NotImplementedError("""************
@@ -66,7 +71,10 @@ def __init__(self, params: GptParams) -> None:
         self.lparams.use_mlock = self.params.use_mlock
         self.lparams.use_mmap = self.params.use_mmap
 
-        self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams)
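+        # load the weights once, then build an inference context on top of the model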
+        self.model = llama_cpp.llama_load_model_from_file(
+            self.params.model.encode("utf8"), self.lparams)
+        self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.lparams)
         if (not self.ctx):
             raise RuntimeError(f"error: failed to load model '{self.params.model}'")
 
@@ -181,12 +189,13 @@ def __init__(self, params: GptParams) -> None:
 number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr)
 
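+        # debug dump: print every prompt token id next to its decoded text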
         for i in range(len(self.embd_inp)):
-            print(f"{self.embd_inp[i]} -> '{llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i])}'", file=sys.stderr)
+            print(f"{self.embd_inp[i]} -> '{self.token_to_str(self.embd_inp[i])}'", file=sys.stderr)
 
         if (self.params.n_keep > 0):
             print("static prompt based on n_keep: '")
             for i in range(self.params.n_keep):
-                print(llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i]), file=sys.stderr)
+                print(self.token_to_str(self.embd_inp[i]), file=sys.stderr)
             print("'", file=sys.stderr)
         print(file=sys.stderr)
 
@@ -339,7 +348,8 @@ def generate(self):
             candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False))
 
             # Apply penalties
-            nl_logit = logits[llama_cpp.llama_token_nl()]
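+            # remember the newline logit before repetition penalties modify the distribution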
+            nl_logit = logits[llama_cpp.llama_token_nl(self.ctx)]
             last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)
 
             _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:])
@@ -380,7 +390,7 @@ def generate(self):
             self.last_n_tokens.append(id)
 
             # replace end of text token with newline token when in interactive mode
-            if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct):
+            if (id == llama_cpp.llama_token_eos(self.ctx) and self.params.interactive and not self.params.instruct):
                 id = self.llama_token_newline[0]
             self.embd.append(id)
             if (self.use_antiprompt()):
@@ -437,7 +447,7 @@ def generate(self):
                 break
 
             # end of text token
-            if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos():
+            if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(self.ctx):
                 if (not self.params.instruct):
                     for i in self.llama_token_eot:
                         yield i
@@ -464,10 +474,20 @@ def exit(self):
         llama_cpp.llama_free(self.ctx)
         self.set_color(util.CONSOLE_COLOR_DEFAULT)
 
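+    # decode a single token id into the raw bytes of its text piece; a fixed
+    # 32-byte buffer is assumed to be large enough for any one vocab piece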
+    def token_to_str(self, token_id: int) -> bytes:
+        size = 32
+        buffer = (ctypes.c_char * size)()
+        n = llama_cpp.llama_token_to_piece_with_model(
+            self.model, llama_cpp.llama_token(token_id), buffer, size)
+        assert n <= size
+        return bytes(buffer[:n])
+
     # return past text
     def past(self):
         for id in self.last_n_tokens[-self.n_past:]:
-            yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf8", errors="ignore")
+            yield self.token_to_str(id).decode("utf8", errors="ignore")
 
     # write input
     def input(self, prompt: str):
@@ -481,7 +501,7 @@ def input(self, prompt: str):
     def output(self):
         self.remaining_tokens = self.params.n_predict
         for id in self.generate():
-            cur_char = llama_cpp.llama_token_to_str(self.ctx, id)
+            cur_char = self.token_to_str(id)
 
             # Add remainder of missing bytes
             if None in self.multibyte_fix: