Continuous mode not working #148

Closed · wants to merge 40 commits

Changes from 1 commit

Commits (40)
9fde685
Update README.md for Transcribe
vivekuppal Jun 29, 2023
1483fea
Merge pull request #1 from vivekuppal/vu-readme-updates
vivekuppal Jun 29, 2023
391d728
Allow usage without a valid OPEN API key. (#2)
vivekuppal Jun 29, 2023
ab4245d
Update README.md (#3)
vivekuppal Jun 29, 2023
ebb6f2f
Allow user to choose model. Add arguments to main file.
vivekuppal Jun 29, 2023
8f5a595
Code clean up, add linting. (#4)
vivekuppal Jun 30, 2023
59d5c91
UI Text Chronology (#5)
vivekuppal Jun 30, 2023
f772bb8
Update readme with Enhancements. Allow copy of text from UI window. R…
vivekuppal Jun 30, 2023
87a38b1
Save conversation to text. (#9)
vivekuppal Jun 30, 2023
65d6dcf
Add Contextual Information to Responses (#11)
vivekuppal Jun 30, 2023
d1b3c45
Allow users to pause audio transcription. Change the default for gett…
vivekuppal Jul 3, 2023
cfca51a
Update main.py (#15)
abhinavuppal1 Jul 11, 2023
152bad3
Code reorg to separate UI code (#16)
vivekuppal Jul 12, 2023
addf17f
Add support for multiple languages (#18)
vivekuppal Jul 12, 2023
e5cda88
Easy install for non developers on windows (#20)
vivekuppal Jul 18, 2023
9896c1c
Disabled winrar UI (#22)
Adarsha-gg Jul 18, 2023
901501b
When using API, we do not need to specify language, absorb the lang p…
vivekuppal Jul 18, 2023
bd48b61
Language combo fix (#26)
Adarsha-gg Jul 19, 2023
7c9ca88
Added gdrive (#27)
Adarsha-gg Jul 19, 2023
2429c97
Allow usage of API Key in installed version of Transcribe (#28)
vivekuppal Jul 19, 2023
12ef846
updated the drive link (#30)
Adarsha-gg Jul 20, 2023
4be26c7
Add a duration class to easily measure the time taken for an operatio…
vivekuppal Jul 21, 2023
6e53b31
--api option was not working correctly (#34)
vivekuppal Jul 21, 2023
bd42b8c
Initial unit tests for the speech recognition library (#36)
vivekuppal Jul 24, 2023
af87eff
user reported defect fixes. (#39)
vivekuppal Jul 26, 2023
26cfaad
Optimize LLM usage (#40)
vivekuppal Jul 26, 2023
f8d5857
Bug fixes for exceptions observed during usage. Add further plumbing …
vivekuppal Jul 27, 2023
1356a78
Add logging infrastructure (#42)
vivekuppal Jul 27, 2023
a1cc48b
Get Response from LLM on demand (#44)
vivekuppal Jul 28, 2023
ea5f392
Models from open ai site (#43)
Adarsha-gg Jul 28, 2023
b4e03a4
List all active devices (#45)
vivekuppal Aug 1, 2023
85d09ed
Allow user to select input, output audio devices (#48)
vivekuppal Aug 21, 2023
28d1e9a
Disable mic speaker selectively (#49)
vivekuppal Aug 23, 2023
e48bdb8
Add Audio Response for LLM generated content (#50)
vivekuppal Aug 27, 2023
6baa77f
Update, upload latest binaries (#54)
Adarsha-gg Aug 30, 2023
fa55416
Multiturn prompts, bug fixes (#55)
vivekuppal Sep 5, 2023
ce5a1e1
Allow enable/disable speaker and microphone from UI (#56)
Adarsha-gg Sep 6, 2023
e445856
Update gdrive link (#58)
Adarsha-gg Sep 7, 2023
b50f58c
Bring readme up to date with current functionality. Describe content …
vivekuppal Sep 8, 2023
a7ea2cc
Continuous mode broke after updates to the UI.
vivekuppal Sep 8, 2023
Add Audio Response for LLM generated content (#50)
* Add text to speech. Parse OpenAI responses better than before
vivekuppal committed Aug 27, 2023
commit e48bdb813414078f615355d304789cb8dccda1f8
4 changes: 1 addition & 3 deletions AudioRecorder.py
@@ -174,9 +174,7 @@ def set_device(self, index: int):
)
self.source = source
print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ')
# This line is commented because in case of non default microphone it can occasionally take
# several minutes to execute, thus delaying the start of the application.
# self.adjust_for_noise("Default Mic", "Please make some noise from the Default Mic...")
self.adjust_for_noise("Mic", "Please make some noise from the chosen Mic...")


class SpeakerRecorder(BaseRecorder):
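For context on the `adjust_for_noise` change above: ambient-noise calibration of the kind the `speech_recognition` library performs samples the source for a fixed window before transcription starts. A minimal sketch of that idea, using `speech_recognition` directly (the `device_index` and `duration` values are illustrative assumptions, not taken from this repository):

```python
# Hedged sketch: ambient-noise calibration with a bounded sampling window.
# device_index=1 and duration=1.0 are illustrative values only.
import speech_recognition as sr

recognizer = sr.Recognizer()
microphone = sr.Microphone(device_index=1)  # hypothetical non-default mic

with microphone as source:
    # Sampling is bounded by `duration`, so the startup cost stays predictable;
    # the removed comment above described unbounded delays on some devices.
    recognizer.adjust_for_ambient_noise(source, duration=1.0)
print('Calibration complete; energy threshold =', recognizer.energy_threshold)
```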
19 changes: 0 additions & 19 deletions CustomPrompts.py

This file was deleted.

59 changes: 43 additions & 16 deletions GPTResponder.py
@@ -1,8 +1,9 @@
import datetime
import time
import pprint
import openai
import GlobalVars
import prompts
import openai
import conversation
import constants
import configuration
@@ -30,14 +31,12 @@ def generate_response_from_transcript_no_check(self, transcript) -> str:
Gets a response even if the continuous suggestion option is disabled.
"""
try:
# prompt_content = create_prompt(transcript)
# prompt_api_message = [{"role": "system", "content": prompt_content}]
prompt_api_message = prompts.create_single_turn_prompt_message(transcript)
multiturn_prompt_content = self.conversation.get_merged_conversation(
length=constants.MAX_TRANSCRIPTION_PHRASES_FOR_LLM)
multiturn_prompt_api_message = prompts.create_multiturn_prompt(multiturn_prompt_content)
# print(f'Usual prompt api message: {prompt_api_message}')
# print(f'Multiturn prompt: {multiturn_prompt_api_message}')
# pprint.pprint(f'Prompt api message: {prompt_api_message}')
# print(f'Multiturn prompt for ChatGPT: {multiturn_prompt_api_message}')
usual_response = openai.ChatCompletion.create(
model=self.model,
messages=prompt_api_message,
@@ -51,13 +50,32 @@ def generate_response_from_transcript_no_check(self, transcript) -> str:
return prompts.INITIAL_RESPONSE

usual_full_response = usual_response.choices[0].message.content
# pprint.pprint(f'Prompt api response: {usual_response}')
try:
return usual_full_response.split('[')[1].split(']')[0]
# The original way of processing the response. It used to cause issues when there
# were multiple questions in the transcript.
# response = usual_full_response.split('[')[1].split(']')[0]
processed_response = self.process_response(usual_full_response)
self.update_conversation(persona=constants.PERSONA_ASSISTANT, response=processed_response)
return processed_response
except Exception as exception:
root_logger.error('Error parsing response from LLM.')
root_logger.exception(exception)
return prompts.INITIAL_RESPONSE

def process_response(self, input_str: str) -> str:
lines = input_str.split(sep='\n')
response = ''
for line in lines:
# Skip any responses that contain content like
# Speaker 1: <Some statement>
# This is generated content added by OpenAI that can be skipped
if 'Speaker' in line and ':' in line:
continue
response = response + line.strip().strip('[').strip(']')

return response

def generate_response_from_transcript(self, transcript):
"""Ping OpenAI LLM model to get response from the Assistant
"""
@@ -67,6 +85,13 @@ def generate_response_from_transcript(self, transcript):

return self.generate_response_from_transcript_no_check(transcript)

def update_conversation(self, response, persona):
if response != '':
self.response = response
self.conversation.update_conversation(persona=persona,
text=response,
time_spoken=datetime.datetime.now())

def respond_to_transcriber(self, transcriber):
"""Thread method to continously update the transcript
"""
@@ -76,27 +101,29 @@ def respond_to_transcriber(self, transcriber):
start_time = time.time()

transcriber.transcript_changed_event.clear()
transcript_string = transcriber.get_transcript(
length=constants.MAX_TRANSCRIPTION_PHRASES_FOR_LLM)
response = self.generate_response_from_transcript(transcript_string)
response = ''

# Do processing only if LLM transcription is enabled
if not self.gl_vars.freeze_state[0]:
transcript_string = transcriber.get_transcript(
length=constants.MAX_TRANSCRIPTION_PHRASES_FOR_LLM)
response = self.generate_response_from_transcript(transcript_string)

end_time = time.time() # Measure end time
execution_time = end_time - start_time # Calculate time to execute the function

if response != '':
self.response = response
self.conversation.update_conversation(persona=constants.PERSONA_ASSISTANT,
text=response,
time_spoken=datetime.datetime.now())

remaining_time = self.response_interval - execution_time
if remaining_time > 0:
time.sleep(remaining_time)
else:
time.sleep(0.3)
time.sleep(self.response_interval)

def update_response_interval(self, interval):
"""Change the interval for pinging LLM
"""
root_logger.info(GPTResponder.update_response_interval.__name__)
self.response_interval = interval


if __name__ == "__main__":
print('GPTResponder')
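The key fix in this file is `process_response`, which replaces the old `split('[')[1].split(']')[0]` parsing that broke whenever the model returned multiple bracketed segments. It drops fabricated `Speaker N: ...` continuation lines and strips the square brackets the prompt asks the model to wrap its answer in. A standalone illustration of that behavior (the sample LLM output is invented):

```python
# Standalone sketch of the filtering that process_response performs.
# The example response string below is invented for illustration.
def process_response(input_str: str) -> str:
    response = ''
    for line in input_str.split(sep='\n'):
        # Skip lines like "Speaker 1: <statement>" that the model sometimes
        # fabricates as imagined continuations of the conversation.
        if 'Speaker' in line and ':' in line:
            continue
        response = response + line.strip().strip('[').strip(']')
    return response

sample = '[Sure, the meeting is at 3 PM.]\nSpeaker 1: Great, see you then.'
print(process_response(sample))  # -> Sure, the meeting is at 3 PM.
```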
2 changes: 2 additions & 0 deletions GlobalVars.py
@@ -2,6 +2,7 @@
import tkinter as tk
import customtkinter as ctk
from AudioTranscriber import AudioTranscriber
from audio_player import AudioPlayer
import AudioRecorder
import Singleton
import app_logging as al
@@ -17,6 +18,7 @@ class TranscriptionGlobals(Singleton.Singleton):
audio_queue: queue.Queue = None
user_audio_recorder: AudioRecorder.MicRecorder = None
speaker_audio_recorder: AudioRecorder.SpeakerRecorder = None
audio_player: AudioPlayer = None
# Global for transcription from speaker, microphone
transcriber: AudioTranscriber = None
# Global for responses from openAI API
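The `audio_player` field added here follows the file's existing pattern: `TranscriptionGlobals` is a Singleton, so the recorders, the transcriber, and now the audio player are created once and shared across the UI and worker threads. A minimal sketch of that pattern (abbreviated; not the project's exact `Singleton` implementation):

```python
# Minimal sketch of a Singleton-backed globals registry; names abbreviated.
class Singleton:
    _instance = None

    def __new__(cls, *args, **kwargs):
        # Create the single instance lazily on first construction.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance


class TranscriptionGlobals(Singleton):
    audio_player = None  # assigned once at startup, read everywhere else


first = TranscriptionGlobals()
second = TranscriptionGlobals()
assert first is second  # every module sees the same globals object
```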
1 change: 1 addition & 0 deletions README.md
@@ -142,6 +142,7 @@ Incorrect API key provided: API_KEY. You can find your API key at https://platfo
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## ➕ Enhancements from base repository ➕
- Speech Mode - Read out responses from ChatGPT as Audio
- Do not need Open AI key, paid Open AI account to use the complete functionality
- Allow users selective disabling of mic, speaker audio input
- Allow users to add contextual information to provide customized responses to conversation
2 changes: 1 addition & 1 deletion TranscriberModels.py
@@ -73,7 +73,7 @@ def load_model(self):

class APIWhisperTranscriber:
def __init__(self):
print('Using Open AI API for transcription.')
print('[INFO] Using Open AI API for transcription.')
openai.api_key = GlobalVars.TranscriptionGlobals().api_key
# lang parameter is not required for API invocation. This exists solely
# to support --api option from command line.
64 changes: 64 additions & 0 deletions audio_player.py
@@ -0,0 +1,64 @@
"""Plays the responses received from LLM as Audio
This class does text to speech
"""

import os
import time
import tempfile
import threading
import playsound
import gtts
import app_logging as al
import conversation
import constants


root_logger = al.get_logger()


class AudioPlayer:
"""Play text to audio
"""
def __init__(self, convo: conversation):
root_logger.info(AudioPlayer.__name__)
self.speech_text_available = threading.Event()
self.conversation = convo
self.temp_dir = tempfile.gettempdir()

def play_audio(self, speech: str):
"""Play text to audio.
This is a blocking method and will return when audio playback is complete.
For large audio text, this could be several minutes.
"""
root_logger.info(AudioPlayer.__name__)
audio_obj = gtts.gTTS(speech)
temp_audio_file = tempfile.mkstemp(dir=self.temp_dir, suffix='.mp3')
os.close(temp_audio_file[0])

audio_obj.save(temp_audio_file[1])
try:
playsound.playsound(temp_audio_file[1])
except playsound.PlaysoundException as play_ex:
print('Error when attempting to play audio.')
print(play_ex)

os.remove(temp_audio_file[1])

def play_audio_loop(self):
"""Play text to audio continuously based on the signaling of event
"""
while True:
if self.speech_text_available.is_set():
self.speech_text_available.clear()
speech = self.conversation.get_conversation(
sources=[constants.PERSONA_ASSISTANT], length=1)
# Speech text is in the format
# f"{persona}: [{text}]\n\n"
# Remove persona
final_speech = speech[len(constants.PERSONA_ASSISTANT)+2:]
# Remove whitespace
final_speech = final_speech.strip()
# Remove Square brackets
final_speech = final_speech[1:-1]
self.play_audio(speech=final_speech)
time.sleep(0.1)
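`play_audio_loop` is built to run on a background thread and wake when `speech_text_available` is set after a new assistant response lands in the conversation. A sketch of the intended wiring (the `StubConversation` here is a hypothetical stand-in for the real `Conversation` object):

```python
# Hedged sketch of driving the player loop; StubConversation is invented
# to stand in for conversation.Conversation.
import threading
from audio_player import AudioPlayer


class StubConversation:
    def get_conversation(self, sources=None, length=1):
        # Mimics the "persona: [text]\n\n" format the loop expects.
        return 'assistant: [Hello there]\n\n'


player = AudioPlayer(convo=StubConversation())
threading.Thread(target=player.play_audio_loop, daemon=True).start()

# Later, whenever a fresh LLM response is stored:
player.speech_text_available.set()  # loop strips persona and brackets, then speaks
```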
4 changes: 2 additions & 2 deletions constants.py
@@ -2,8 +2,8 @@
"""

PERSONA_YOU = 'You'
PERSONA_ASSISTANT = 'Assistant'
PERSONA_SYSTEM = 'System'
PERSONA_ASSISTANT = 'assistant'
PERSONA_SYSTEM = 'system'
PERSONA_SPEAKER = 'Speaker'

LOG_NAME = 'Transcribe'
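Lowercasing `PERSONA_ASSISTANT` and `PERSONA_SYSTEM` likely lines these constants up with the role strings the OpenAI chat completions API expects, which matters now that conversation entries are fed back as chat messages. For reference, a chat payload uses exactly these lowercase roles (the contents are invented):

```python
# The lowercase role strings the OpenAI chat API expects; content invented.
messages = [
    {'role': 'system', 'content': 'You are a casual pal...'},
    {'role': 'assistant', 'content': '[Sure, happy to help.]'},
]
```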
32 changes: 18 additions & 14 deletions conversation.py
@@ -1,14 +1,8 @@
from heapq import merge
import datetime
# import pprint
import constants
import configuration
import datetime

DEFAULT_PREAMBLE = """You are a casual pal, genuinely interested in the conversation at hand.""" \
"""Please respond, in detail, to the conversation. Confidently give a """\
"""straightforward response to the speaker, even if you don't understand """\
"""them. Give your response in square brackets. DO NOT ask to repeat, """\
"""and DO NOT ask for clarification. Just answer the speaker directly."""\
"""A poor transcription of conversation is given below."""


class Conversation:
@@ -21,9 +15,17 @@ def __init__(self):
constants.PERSONA_YOU: [],
constants.PERSONA_SPEAKER: [],
constants.PERSONA_ASSISTANT: []}
transcript = self.transcript_data[constants.PERSONA_SYSTEM]
transcript.append((f"{constants.PERSONA_SYSTEM}: [{DEFAULT_PREAMBLE}]\n\n", datetime.datetime.now()))
config = configuration.Config().get_data()
prompt = config["OpenAI"]["system_prompt"]
self.update_conversation(persona=constants.PERSONA_SYSTEM, text=prompt,
time_spoken=datetime.datetime.now())
initial_convo: dict = config["OpenAI"]["initial_convo"]
# Read the initial conversation from parameters.yaml file and add to the convo
for _, value in initial_convo.items():
role = value['role']
content = value['content']
self.update_conversation(persona=role, text=content,
time_spoken=datetime.datetime.now())

def clear_conversation_data(self):
"""Clear all conversation data
@@ -47,12 +49,12 @@ def update_conversation(self, persona: str, text: str, time_spoken, pop: bool =

def get_conversation(self,
sources: list = None,
length: int = 0):
"""Get the complete transcript
length: int = 0) -> list:
"""Get the transcript based on specified sources
Args:
sources: Get data from which sources (You, Speaker, Assistant, System)
length: Get the last length elements from the audio transcript.
Default value = 0, gives the complete transcript
Default value = 0, gives the complete transcript for chosen sources
"""
if sources is None:
sources = [constants.PERSONA_YOU,
@@ -72,19 +74,21 @@ def get_merged_conversation(self, length: int = 0) -> list:
def get_merged_conversation(self, length: int = 0) -> list:
"""Creates a prompt to be sent to LLM (OpenAI by default)
length: Get the last length elements from the audio transcript.
Initial system prompt is always part of the return value
Default value = 0, gives the complete transcript
"""
# print(f'You: Length: {len(self.transcript_data[constants.PERSONA_YOU])}')
# print(f'Speaker: Length: {len(self.transcript_data[constants.PERSONA_SPEAKER])}')
# print(f'Assistant: Length: {len(self.transcript_data[constants.PERSONA_ASSISTANT])}')
# print(f'System: Length: {len(self.transcript_data[constants.PERSONA_SYSTEM])}')

combined_transcript = self.transcript_data[constants.PERSONA_SYSTEM]
combined_transcript = list(merge(
combined_transcript,
self.transcript_data[constants.PERSONA_YOU][-length:],
self.transcript_data[constants.PERSONA_SPEAKER][-length:],
self.transcript_data[constants.PERSONA_ASSISTANT][-length:],
key=lambda x: x[1]))
combined_transcript = combined_transcript[-length:]

combined_transcript.insert(0, (f"{constants.PERSONA_SYSTEM}: [{self.transcript_data[constants.PERSONA_SYSTEM][0]}]\n\n", datetime.datetime.now()))
return combined_transcript
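`get_merged_conversation` now interleaves the per-persona `(text, time_spoken)` lists by timestamp with `heapq.merge` (each list is already chronological), trims to the last `length` entries, and re-inserts the system prompt at the front. A toy illustration of the merge step (the tuples are invented):

```python
# Toy illustration of merging per-persona (text, time) lists by timestamp;
# the tuples below are invented for the example.
import datetime
from heapq import merge

t = datetime.datetime
you = [('You: [hi]', t(2023, 8, 27, 10, 0, 0))]
speaker = [('Speaker: [hello]', t(2023, 8, 27, 10, 0, 5))]
assistant = [('assistant: [Hey!]', t(2023, 8, 27, 10, 0, 2))]

combined = list(merge(you, speaker, assistant, key=lambda x: x[1]))
print([text for text, _ in combined])
# -> ['You: [hi]', 'assistant: [Hey!]', 'Speaker: [hello]']
```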