
Commit 3dbb3fd

Add support for stream parameter. Closes #1
1 parent 30fc0f3 commit 3dbb3fd

File tree

2 files changed: +129 additions, -33 deletions


examples/high_level_api_streaming.py

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+import json
+import argparse
+
+from llama_cpp import Llama
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-m", "--model", type=str, default=".//models/...")
+args = parser.parse_args()
+
+llm = Llama(model_path=args.model)
+
+stream = llm(
+    "Question: What are the names of the planets in the solar system? Answer: ",
+    max_tokens=48,
+    stop=["Q:", "\n"],
+    stream=True,
+)
+
+for output in stream:
+    print(json.dumps(output, indent=2))
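
Note: the example above prints each raw chunk as JSON. Since every streamed chunk carries its text delta in choices[0]["text"] (see the llama_cpp/llama.py changes below), a caller can also render the output incrementally. A minimal sketch, reusing the same llm instance and prompt as the example above:

# Sketch: print only the incremental text of each streamed chunk,
# reusing the `llm` instance created in the example above.
for output in llm(
    "Question: What are the names of the planets in the solar system? Answer: ",
    max_tokens=48,
    stop=["Q:", "\n"],
    stream=True,
):
    print(output["choices"][0]["text"], end="", flush=True)
print()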

llama_cpp/llama.py

Lines changed: 109 additions & 33 deletions
@@ -88,7 +88,7 @@ def tokenize(self, text: bytes) -> List[int]:
             True,
         )
         if n_tokens < 0:
-            raise RuntimeError(f"Failed to tokenize: text=\"{text}\" n_tokens={n_tokens}")
+            raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}')
         return list(tokens[:n_tokens])

     def detokenize(self, tokens: List[int]) -> bytes:
@@ -105,7 +105,6 @@ def detokenize(self, tokens: List[int]) -> bytes:
             output += llama_cpp.llama_token_to_str(self.ctx, token)
         return output

-
     def _eval(self, tokens: List[int], n_past):
         rc = llama_cpp.llama_eval(
             self.ctx,
@@ -137,12 +136,12 @@ def _generate(self, past_tokens, max_tokens, top_p, top_k, temp, repeat_penalty)
                 top_p=top_p,
                 top_k=top_k,
                 temp=temp,
-                repeat_penalty=repeat_penalty
+                repeat_penalty=repeat_penalty,
             )
             yield token
             self._eval([token], len(past_tokens) + i)

-    def __call__(
+    def _call(
         self,
         prompt: str,
         suffix: Optional[str] = None,
@@ -154,34 +153,11 @@ def __call__(
         stop: List[str] = [],
         repeat_penalty: float = 1.1,
         top_k: int = 40,
+        stream: bool = False,
     ):
-        """Generate text from a prompt.
-
-        Args:
-            prompt: The prompt to generate text from.
-            suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
-            temperature: The temperature to use for sampling.
-            top_p: The top-p value to use for sampling.
-            logprobs: The number of logprobs to return. If None, no logprobs are returned.
-            echo: Whether to echo the prompt.
-            stop: A list of strings to stop generation when encountered.
-            repeat_penalty: The penalty to apply to repeated tokens.
-            top_k: The top-k value to use for sampling.
-
-        Raises:
-            ValueError: If the requested tokens exceed the context window.
-            RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
-
-        Returns:
-            Response object containing the generated text.
-        """
         completion_id = f"cmpl-{str(uuid.uuid4())}"
-        created= int(time.time())
-        text = b""
+        created = int(time.time())
         completion_tokens = []
-        last_n_tokens = deque([0] * self.last_n, maxlen=self.last_n)
-
         prompt_tokens = self.tokenize(prompt.encode("utf-8"))

         if len(prompt_tokens) + max_tokens > llama_cpp.llama_n_ctx(self.ctx):
@@ -198,24 +174,71 @@ def __call__(
         stop = [s.encode("utf-8") for s in stop]

         finish_reason = None
-        for token in self._generate(prompt_tokens, max_tokens, top_p, top_k, temperature, repeat_penalty):
+        for token in self._generate(
+            prompt_tokens, max_tokens, top_p, top_k, temperature, repeat_penalty
+        ):
             if token == llama_cpp.llama_token_eos():
                 finish_reason = "stop"
                 break
-            text += self.detokenize([token])
-            last_n_tokens.append(token)
             completion_tokens.append(token)

+            text = self.detokenize(completion_tokens)
             any_stop = [s for s in stop if s in text]
             if len(any_stop) > 0:
                 first_stop = any_stop[0]
                 text = text[: text.index(first_stop)]
                 finish_reason = "stop"
                 break

+            if stream:
+                start = len(self.detokenize(completion_tokens[:-1]))
+                longest = 0
+                for s in stop:
+                    for i in range(len(s), 0, -1):
+                        if s[-i:] == text[-i:]:
+                            if i > longest:
+                                longest = i
+                            break
+                yield {
+                    "id": completion_id,
+                    "object": "text_completion",
+                    "created": created,
+                    "model": self.model_path,
+                    "choices": [
+                        {
+                            "text": text[start : len(text) - longest].decode("utf-8"),
+                            "index": 0,
+                            "logprobs": None,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+
         if finish_reason is None:
             finish_reason = "length"

+        if stream:
+            if finish_reason == "stop":
+                start = len(self.detokenize(completion_tokens[:-1]))
+                text = text[start:].decode("utf-8")
+            else:
+                text = ""
+            yield {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": created,
+                "model": self.model_path,
+                "choices": [
+                    {
+                        "text": text,
+                        "index": 0,
+                        "logprobs": None,
+                        "finish_reason": finish_reason,
+                    }
+                ],
+            }
+            return
+
         text = text.decode("utf-8")

         if echo:
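
Note: the `longest` computation in the streaming branch above decides how many trailing bytes to withhold from a chunk: it finds the longest suffix that the generated text currently shares with any stop sequence, presumably so bytes that may belong to an emerging stop marker are not emitted prematurely. A standalone sketch of that computation (the function name is illustrative, not part of the diff):

# Standalone sketch of the hold-back length used in the streaming branch:
# the longest i such that the last i bytes of `text` equal the last i bytes
# of some stop sequence.
def hold_back(text: bytes, stop: list) -> int:
    longest = 0
    for s in stop:
        for i in range(len(s), 0, -1):
            if s[-i:] == text[-i:]:
                if i > longest:
                    longest = i
                break
    return longest

print(hold_back(b"Answer:", [b"Q:"]))  # 1: the trailing ":" is withheld from this chunk
print(hold_back(b"Answer", [b"Q:"]))   # 0: nothing is withheld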
@@ -229,7 +252,7 @@ def __call__(
                 self.ctx,
             )[:logprobs]

-        return {
+        yield {
             "id": completion_id,
             "object": "text_completion",
             "created": created,
@@ -249,5 +272,58 @@ def __call__(
             },
         }

+    def __call__(
+        self,
+        prompt: str,
+        suffix: Optional[str] = None,
+        max_tokens: int = 16,
+        temperature: float = 0.8,
+        top_p: float = 0.95,
+        logprobs: Optional[int] = None,
+        echo: bool = False,
+        stop: List[str] = [],
+        repeat_penalty: float = 1.1,
+        top_k: int = 40,
+        stream: bool = False,
+    ):
+        """Generate text from a prompt.
+
+        Args:
+            prompt: The prompt to generate text from.
+            suffix: A suffix to append to the generated text. If None, no suffix is appended.
+            max_tokens: The maximum number of tokens to generate.
+            temperature: The temperature to use for sampling.
+            top_p: The top-p value to use for sampling.
+            logprobs: The number of logprobs to return. If None, no logprobs are returned.
+            echo: Whether to echo the prompt.
+            stop: A list of strings to stop generation when encountered.
+            repeat_penalty: The penalty to apply to repeated tokens.
+            top_k: The top-k value to use for sampling.
+            stream: Whether to stream the results.
+
+        Raises:
+            ValueError: If the requested tokens exceed the context window.
+            RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
+
+        Returns:
+            Response object containing the generated text.
+        """
+        call = self._call(
+            prompt=prompt,
+            suffix=suffix,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            logprobs=logprobs,
+            echo=echo,
+            stop=stop,
+            repeat_penalty=repeat_penalty,
+            top_k=top_k,
+            stream=stream,
+        )
+        if stream:
+            return call
+        return next(call)
+
     def __del__(self):
         llama_cpp.llama_free(self.ctx)
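
Note: after this change `_call` is always a generator (the earlier `return {...}` became `yield {...}`), and the public `__call__` either hands that generator straight back to the caller (stream=True) or drains its single completion dict with next(call) (stream=False), so existing non-streaming callers keep the old return shape. A minimal usage sketch; the model path and prompt here are placeholders, not from the diff:

from llama_cpp import Llama

llm = Llama(model_path="./models/your-model.bin")  # placeholder path

# stream=False (default): __call__ consumes the internal generator with
# next() and returns a single completion dict, as before this commit.
result = llm("Question: 1 + 1 = ? Answer: ", max_tokens=8, stop=["Q:", "\n"])
print(result["choices"][0]["text"])

# stream=True: __call__ returns the generator unconsumed, so the caller
# iterates over partial-completion dicts instead.
for chunk in llm("Question: 1 + 1 = ? Answer: ", max_tokens=8, stop=["Q:", "\n"], stream=True):
    print(chunk["choices"][0]["text"], end="")
print()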
