v2.4

utopic-dev · May 5, 2024 · 69ce824
1 parent 8d00d4d
commit 69ce824
Show file tree

Hide file tree

Showing 3 changed files with 13 additions and 8 deletions.
diff --git a/cookbook/llms/ollama/assistant.py b/cookbook/llms/ollama/assistant.py
@@ -9,4 +9,4 @@
 )
 assistant.print_response("Share a quick healthy breakfast recipe.", markdown=True)
 print("\n-*- Metrics:")
-pprint(assistant.llm.metrics)
+pprint(assistant.llm.metrics) # type: ignore
diff --git a/phi/assistant/assistant.py b/phi/assistant/assistant.py
@@ -1501,6 +1501,7 @@ async def async_print_response(
 
  def cli_app(
  self,
+ message: Optional[str] = None,
  user: str = "User",
  emoji: str = ":sunglasses:",
  stream: bool = True,
@@ -1509,6 +1510,9 @@ def cli_app(
  ) -> None:
  from rich.prompt import Prompt
 
+ if message:
+ self.print_response(message=message, stream=stream, markdown=markdown)
+
  _exit_on = exit_on or ["exit", "quit", "bye"]
  while True:
  message = Prompt.ask(f"[bold] {emoji} {user} [/bold]")

diff --git a/phi/llm/ollama/chat.py b/phi/llm/ollama/chat.py
@@ -272,9 +272,9 @@ def response_stream(self, messages: List[Message]) -> Iterator[str]:
  yield response_content
 
  response_timer.stop()
- logger.debug(f"Number of tokens generated: {completion_tokens}")
- logger.debug(f"Time per output token: {response_timer.elapsed/completion_tokens:.4f}s")
- logger.debug(f"Throughtput: {completion_tokens/response_timer.elapsed:.4f} tokens/s")
+ logger.debug(f"Tokens generated: {completion_tokens}")
+ logger.debug(f"Time per output token: {response_timer.elapsed / completion_tokens:.4f}s")
+ logger.debug(f"Throughput: {completion_tokens / response_timer.elapsed:.4f} tokens/s")
  logger.debug(f"Time to generate response: {response_timer.elapsed:.4f}s")
 
  # -*- Create assistant message
@@ -313,17 +313,18 @@ def response_stream(self, messages: List[Message]) -> Iterator[str]:
 
  # -*- Update usage metrics
  # Add response time to metrics
- assistant_message.metrics["time"] = response_timer.elapsed
- assistant_message.metrics["time_to_first_token"] = time_to_first_token
+ assistant_message.metrics["time"] = f"{response_timer.elapsed:.4f}"
+ assistant_message.metrics["time_to_first_token"] = f"{time_to_first_token:.4f}s"
+ assistant_message.metrics["time_per_output_token"] = f"{response_timer.elapsed / completion_tokens:.4f}s"
  if "response_times" not in self.metrics:
  self.metrics["response_times"] = []
  self.metrics["response_times"].append(response_timer.elapsed)
  if "time_to_first_token" not in self.metrics:
  self.metrics["time_to_first_token"] = []
- self.metrics["time_to_first_token"].append(time_to_first_token)
+ self.metrics["time_to_first_token"].append(f"{time_to_first_token:.4f}s")
  if "tokens_per_second" not in self.metrics:
  self.metrics["tokens_per_second"] = []
- self.metrics["tokens_per_second"].append(completion_tokens / response_timer.elapsed)
+ self.metrics["tokens_per_second"].append(f"{completion_tokens / response_timer.elapsed:.4f}")
 
  # -*- Add assistant message to messages
  messages.append(assistant_message)