Continuous mode not working #148

Closed
wants to merge 40 commits into from
Changes from 1 commit
40 commits
9fde685
Update README.md for Transcribe
vivekuppal Jun 29, 2023
1483fea
Merge pull request #1 from vivekuppal/vu-readme-updates
vivekuppal Jun 29, 2023
391d728
Allow usage without a valid OPEN API key. (#2)
vivekuppal Jun 29, 2023
ab4245d
Update README.md (#3)
vivekuppal Jun 29, 2023
ebb6f2f
Allow user to choose model. Add arguments to main file.
vivekuppal Jun 29, 2023
8f5a595
Code clean up, add linting. (#4)
vivekuppal Jun 30, 2023
59d5c91
UI Text Chronology (#5)
vivekuppal Jun 30, 2023
f772bb8
Update readme with Enhancements. Allow copy of text from UI window. R…
vivekuppal Jun 30, 2023
87a38b1
Save conversation to text. (#9)
vivekuppal Jun 30, 2023
65d6dcf
Add Contextual Information to Responses (#11)
vivekuppal Jun 30, 2023
d1b3c45
Allow users to pause audio transcription. Change the default for gett…
vivekuppal Jul 3, 2023
cfca51a
Update main.py (#15)
abhinavuppal1 Jul 11, 2023
152bad3
Code reorg to separate UI code (#16)
vivekuppal Jul 12, 2023
addf17f
Add support for multiple languages (#18)
vivekuppal Jul 12, 2023
e5cda88
Easy install for non developers on windows (#20)
vivekuppal Jul 18, 2023
9896c1c
Disabled winrar UI (#22)
Adarsha-gg Jul 18, 2023
901501b
When using API, we do not need to specify language, absorb the lang p…
vivekuppal Jul 18, 2023
bd48b61
Language combo fix (#26)
Adarsha-gg Jul 19, 2023
7c9ca88
Added gdrive (#27)
Adarsha-gg Jul 19, 2023
2429c97
Allow usage of API Key in installed version of Transcribe (#28)
vivekuppal Jul 19, 2023
12ef846
updated the drive link (#30)
Adarsha-gg Jul 20, 2023
4be26c7
Add a duration class to easily measure the time taken for an operatio…
vivekuppal Jul 21, 2023
6e53b31
--api option was not working correctly (#34)
vivekuppal Jul 21, 2023
bd42b8c
Initial unit tests for the speech recognition library (#36)
vivekuppal Jul 24, 2023
af87eff
user reported defect fixes. (#39)
vivekuppal Jul 26, 2023
26cfaad
Optimize LLM usage (#40)
vivekuppal Jul 26, 2023
f8d5857
Bug fixes for exceptions observed during usage. Add further plumbing …
vivekuppal Jul 27, 2023
1356a78
Add logging infrastructure (#42)
vivekuppal Jul 27, 2023
a1cc48b
Get Response from LLM on demand (#44)
vivekuppal Jul 28, 2023
ea5f392
Models from open ai site (#43)
Adarsha-gg Jul 28, 2023
b4e03a4
List all active devices (#45)
vivekuppal Aug 1, 2023
85d09ed
Allow user to select input, output audio devices (#48)
vivekuppal Aug 21, 2023
28d1e9a
Disable mic speaker selectively (#49)
vivekuppal Aug 23, 2023
e48bdb8
Add Audio Response for LLM generated content (#50)
vivekuppal Aug 27, 2023
6baa77f
Update, upload latest binaries (#54)
Adarsha-gg Aug 30, 2023
fa55416
Multiturn prompts, bug fixes (#55)
vivekuppal Sep 5, 2023
ce5a1e1
Allow enable/disable speaker and microphone from UI (#56)
Adarsha-gg Sep 6, 2023
e445856
Update gdrive link (#58)
Adarsha-gg Sep 7, 2023
b50f58c
Bring readme up to date with current functionality. Describe content …
vivekuppal Sep 8, 2023
a7ea2cc
Continuous mode broke after updates to the UI.
vivekuppal Sep 8, 2023
Multiturn prompts, bug fixes (#55)
* multiturn prompts, bug fixes
vivekuppal committed Sep 5, 2023
commit fa55416558dfe23860df2a8e272fa0ec59bbc3ce
23 changes: 20 additions & 3 deletions AudioTranscriber.py
@@ -1,21 +1,25 @@
import os
import queue
from heapq import merge
import threading
import io
from datetime import timedelta
import pprint
import wave
import tempfile
import custom_speech_recognition as sr
import pyaudiowpatch as pyaudio
from heapq import merge
import conversation
import constants


PHRASE_TIMEOUT = 3.05


class AudioTranscriber:
def __init__(self, mic_source, speaker_source, model, convo: conversation.Conversation):
# Transcript_data should be replaced with the conversation object.
# We do not need to store transcription in 2 different places.
self.transcript_data = {"You": [], "Speaker": []}
self.transcript_changed_event = threading.Event()
self.audio_model = model
@@ -105,7 +109,7 @@ def update_transcript(self, who_spoke, text, time_spoken):
"""Update transcript with new data
Args:
who_spoke: Person this audio is attributed to
text: Actual spken words
text: Actual spoken words
time_spoken: Time at which audio was taken, relative to start time
"""
source_info = self.audio_sources[who_spoke]
@@ -129,11 +133,24 @@ def get_transcript(self, length: int = 0):
length: Get the last length elements from the audio transcript.
Default value = 0, gives the complete transcript
"""
# This data should be retrieved from the conversation object.
combined_transcript = list(merge(
self.transcript_data["You"], self.transcript_data["Speaker"],
key=lambda x: x[1], reverse=False))
combined_transcript = combined_transcript[-length:]
return "".join([t[0] for t in combined_transcript])
current_return_val = "".join([t[0] for t in combined_transcript])
sources = [
constants.PERSONA_YOU,
constants.PERSONA_SPEAKER
]
convo_object_return_value = self.conversation.get_conversation(sources=sources)
# print('---------- AudioTranscriber.py get_transcript convo object----------')
# pprint.pprint(convo_object_return_value, width=120)

# print('---------- AudioTranscriber.py get_transcript current implementation----------')
# pprint.pprint(current_return_val, width=120)

return convo_object_return_value

def clear_transcript_data(self):
"""
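The get_transcript hunk above keeps the old heapq.merge path alongside for comparison but now returns the conversation object's view instead. A minimal sketch of the merge pattern both paths rely on, assuming entries are (text, time_spoken) tuples as in this file; the sample data is illustrative:

    from datetime import datetime, timedelta
    from heapq import merge

    t0 = datetime.now()
    you = [("You: [hello]\n\n", t0),
           ("You: [see you later]\n\n", t0 + timedelta(seconds=8))]
    speaker = [("Speaker: [hi there]\n\n", t0 + timedelta(seconds=3))]

    # merge() assumes each input is already sorted by the key; that holds
    # here because entries are appended in arrival order.
    combined = list(merge(you, speaker, key=lambda x: x[1]))
    print("".join(text for text, _ in combined))
    # Prints You / Speaker / You, interleaved in chronological order.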
36 changes: 29 additions & 7 deletions GPTResponder.py
@@ -29,6 +29,7 @@ def __init__(self, convo: conversation.Conversation):
def generate_response_from_transcript_no_check(self, transcript) -> str:
"""Ping LLM to get a suggested response right away.
Gets a response even if the continuous suggestion option is disabled.
Updates the conversation object with the response from LLM.
"""
try:
prompt_api_message = prompts.create_single_turn_prompt_message(transcript)
@@ -42,28 +43,49 @@ def generate_response_from_transcript_no_check(self, transcript) -> str:
messages=prompt_api_message,
temperature=0.0
)
# Multi turn response is only effective when continuous mode is off.
# In continuous mode, there are far too many responses from LLM,
# they confuse the LLM if that many responses are replayed back to LLM.
multi_turn_response = openai.ChatCompletion.create(
model=self.model,
messages=multiturn_prompt_api_message,
temperature=0.0
)

# print('-------- Single Turn --------')
# pprint.pprint(f'message={prompt_api_message}', width=120)
# pprint.pprint(f'response={usual_response}', width=120)
# print('-------- Multi Turn --------')
# pprint.pprint(f'message={multiturn_prompt_api_message}', width=120)
# pprint.pprint(f'response={multi_turn_response}', width=120)
# print('-------- -------- -------- -------- -------- --------')

except Exception as exception:
print(exception)
root_logger.error('Error when attempting to get a response from LLM.')
root_logger.exception(exception)
return prompts.INITIAL_RESPONSE

usual_full_response = usual_response.choices[0].message.content
# single_turn_response_content = usual_response.choices[0].message.content
multi_turn_response_content = multi_turn_response.choices[0].message.content
# pprint.pprint(f'Prompt api response: {usual_response}')
try:
# The original way of processing the response. It used to cause issues when there
# were multiple questions in the transcript.
# response = usual_full_response.split('[')[1].split(']')[0]
processed_response = self.process_response(usual_full_response)
self.update_conversation(persona=constants.PERSONA_ASSISTANT, response=processed_response)
return processed_response
# The original way of processing the response.
# It causes issues when there are multiple questions in the transcript.
# response = single_turn_response_content.split('[')[1].split(']')[0]
# processed_single_turn_response = self.process_response(single_turn_response_content)
processed_multi_turn_response = self.process_response(multi_turn_response_content)
self.update_conversation(persona=constants.PERSONA_ASSISTANT,
response=processed_multi_turn_response)
return processed_multi_turn_response
except Exception as exception:
root_logger.error('Error parsing response from LLM.')
root_logger.exception(exception)
return prompts.INITIAL_RESPONSE

def process_response(self, input_str: str) -> str:
""" Extract relevant data from LLM response.
"""
lines = input_str.split(sep='\n')
response = ''
for line in lines:
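The hunk above adds a second, multi-turn ChatCompletion call and returns its processed content in place of the single-turn response. A hedged sketch of the call shape, using the legacy openai<1.0 SDK that ChatCompletion.create belongs to; the model name and message contents are illustrative stand-ins for self.model and for the output of prompts.create_multiturn_prompt_message, which this diff does not show:

    import openai

    openai.api_key = "sk-..."  # placeholder; a real key is needed to run this

    # Alternating user/assistant turns give the model conversational context.
    messages = [
        {"role": "system", "content": "You are a helpful conversation assistant."},
        {"role": "user", "content": "You: [What time is the demo?]"},
        {"role": "assistant", "content": "[The demo is at 3 PM.]"},
        {"role": "user", "content": "Speaker: [Can we move it earlier?]"},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # stand-in for self.model
        messages=messages,
        temperature=0.0,        # deterministic output, as in the diff
    )
    print(response.choices[0].message.content)

As the new comment in the diff explains, replaying history this way is only practical when continuous mode is off; continuous mode generates too many assistant turns to feed back usefully.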
3 changes: 3 additions & 0 deletions GlobalVars.py
@@ -6,6 +6,7 @@
import AudioRecorder
import Singleton
import app_logging as al
import conversation


root_logger = al.get_logger()
@@ -30,6 +31,8 @@ class TranscriptionGlobals(Singleton.Singleton):
filemenu: tk.Menu = None
response_textbox: ctk.CTkTextbox = None

convo: conversation.Conversation = None

def __init__(self, key: str = 'API_KEY'):
root_logger.info(TranscriptionGlobals.__name__)
if self.audio_queue is None:
2 changes: 1 addition & 1 deletion audio_player.py
@@ -45,7 +45,7 @@ def play_audio(self, speech: str):
os.remove(temp_audio_file[1])

def play_audio_loop(self):
"""Play text to audio continuously based on the signaling of event
"""Play text to audio based on signaling of event
"""
while True:
if self.speech_text_available.is_set():
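play_audio_loop polls a threading.Event to learn when new speech text is ready. A minimal sketch of that signaling pattern, using a blocking wait() variant instead of the is_set() polling in the diff; the names below are illustrative:

    import threading
    import time

    speech_text_available = threading.Event()
    state = {"speech_text": ""}

    def play_audio_loop():
        while True:
            speech_text_available.wait()   # block until a producer signals
            speech_text_available.clear()  # reset so the next set() is seen
            print(f"playing: {state['speech_text']}")

    threading.Thread(target=play_audio_loop, daemon=True).start()
    state["speech_text"] = "Hello from the assistant"
    speech_text_available.set()
    time.sleep(0.1)  # give the daemon thread a moment before the script exits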
1 change: 1 addition & 0 deletions constants.py
@@ -8,3 +8,4 @@

LOG_NAME = 'Transcribe'
MAX_TRANSCRIPTION_PHRASES_FOR_LLM = 20
TRANSCRIPT_UI_UPDATE_DELAY_DURATION_MS = 500
17 changes: 11 additions & 6 deletions conversation.py
@@ -17,15 +17,16 @@ def __init__(self):
constants.PERSONA_ASSISTANT: []}
config = configuration.Config().get_data()
prompt = config["OpenAI"]["system_prompt"]
self.update_conversation(persona=constants.PERSONA_SYSTEM, text=prompt,
self.update_conversation(persona=constants.PERSONA_SYSTEM, text=prompt,
time_spoken=datetime.datetime.now())
initial_convo: dict = config["OpenAI"]["initial_convo"]
# Read the initial conversation from parameters.yaml file and add to the convo
for _, value in initial_convo.items():
role = value['role']
content = value['content']
self.update_conversation(persona=role, text=content,
self.update_conversation(persona=role, text=content,
time_spoken=datetime.datetime.now())
self.last_update: datetime.datetime = datetime.datetime.now()

def clear_conversation_data(self):
"""Clear all conversation data
@@ -34,6 +35,7 @@ def clear_conversation_data(self):
self.transcript_data[constants.PERSONA_SPEAKER].clear()
self.transcript_data[constants.PERSONA_SYSTEM].clear()
self.transcript_data[constants.PERSONA_ASSISTANT].clear()
self.last_update = datetime.datetime.now()

def update_conversation(self, persona: str, text: str, time_spoken, pop: bool = False):
"""Update conversation with new data
@@ -46,15 +48,18 @@ def update_conversation(self, persona: str, text: str, time_spoken, pop: bool =
if pop:
transcript.pop()
transcript.append((f"{persona}: [{text}]\n\n", time_spoken))
self.last_update = datetime.datetime.now()

def get_conversation(self,
sources: list = None,
length: int = 0) -> list:
length: int = 0,
reverse: bool = False) -> list:
"""Get the transcript based on specified sources
Args:
sources: Get data from which sources (You, Speaker, Assistant, System)
length: Get the last length elements from the audio transcript.
Default value = 0, gives the complete transcript for chosen sources
reverse: reverse the sort order or keep it in chronological order
"""
if sources is None:
sources = [constants.PERSONA_YOU,
@@ -67,11 +72,11 @@
self.transcript_data[constants.PERSONA_SPEAKER][-length:] if constants.PERSONA_SPEAKER in sources else [],
self.transcript_data[constants.PERSONA_ASSISTANT][-length:] if constants.PERSONA_ASSISTANT in sources else [],
self.transcript_data[constants.PERSONA_SYSTEM][-length:] if constants.PERSONA_SYSTEM in sources else [],
key=lambda x: x[1]))
key=lambda x: x[1], reverse=reverse))
combined_transcript = combined_transcript[-length:]
return "".join([t[0] for t in combined_transcript])

def get_merged_conversation(self, length: int = 0) -> list:
def get_merged_conversation(self, length: int = 0, reverse: bool = False) -> list:
"""Creates a prompt to be sent to LLM (OpenAI by default)
length: Get the last length elements from the audio transcript.
Initial system prompt is always part of the return value
@@ -88,7 +93,7 @@ def get_merged_conversation(self, length: int = 0) -> list:
self.transcript_data[constants.PERSONA_YOU][-length:],
self.transcript_data[constants.PERSONA_SPEAKER][-length:],
self.transcript_data[constants.PERSONA_ASSISTANT][-length:],
key=lambda x: x[1]))
key=lambda x: x[1], reverse=reverse))
combined_transcript = combined_transcript[-length:]

return combined_transcript
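Both merge sites above gain a reverse flag passed straight through to heapq.merge. One caveat worth a sketch: per the heapq docs, merge(reverse=True) expects each input iterable to already be sorted largest-first, so chronological per-persona lists would need reversing before a newest-first merge. The sample data is illustrative:

    from datetime import datetime, timedelta
    from heapq import merge

    t0 = datetime.now()
    you = [("You: [hi]\n\n", t0),
           ("You: [bye]\n\n", t0 + timedelta(seconds=9))]
    speaker = [("Speaker: [hello]\n\n", t0 + timedelta(seconds=4))]

    # Default: chronological output from ascending-sorted inputs.
    chronological = list(merge(you, speaker, key=lambda x: x[1]))

    # reverse=True wants descending-sorted inputs, hence reversed() first.
    newest_first = list(merge(reversed(you), reversed(speaker),
                              key=lambda x: x[1], reverse=True))
    print([text for text, _ in newest_first])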
12 changes: 6 additions & 6 deletions main.py
@@ -100,11 +100,11 @@ def main():
global_vars.speaker_audio_recorder.set_device(index=args.speaker_device_index)

if args.disable_mic:
print('[INFO] Disabling Microphone')
print('[INFO] Disabling Transcription from the Microphone')
global_vars.user_audio_recorder.disable()

if args.disable_speaker:
print('[INFO] Disabling Speaker')
print('[INFO] Disabling Transcription from the speaker')
global_vars.speaker_audio_recorder.disable()

try:
@@ -153,21 +153,21 @@ def main():

global_vars.speaker_audio_recorder.record_into_queue(global_vars.audio_queue)
global_vars.freeze_state = [True]
convo = conversation.Conversation()
global_vars.convo = conversation.Conversation()

# Transcribe and Respond threads, both work on the same instance of the AudioTranscriber class
global_vars.transcriber = AudioTranscriber(global_vars.user_audio_recorder.source,
global_vars.speaker_audio_recorder.source,
model,
convo=convo)
global_vars.audio_player = AudioPlayer(convo=convo)
convo=global_vars.convo)
global_vars.audio_player = AudioPlayer(convo=global_vars.convo)
transcribe_thread = threading.Thread(target=global_vars.transcriber.transcribe_audio_queue,
name='Transcribe',
args=(global_vars.audio_queue,))
transcribe_thread.daemon = True
transcribe_thread.start()

global_vars.responder = GPTResponder(convo=convo)
global_vars.responder = GPTResponder(convo=global_vars.convo)

respond_thread = threading.Thread(target=global_vars.responder.respond_to_transcriber,
name='Respond',
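The main() changes promote the Conversation instance from a local variable to a field on the TranscriptionGlobals singleton, so the transcriber, responder, and audio player all share one object. A minimal sketch of that wiring with stand-in classes and thread bodies; the real constructors take more arguments, and Singleton.Singleton is simplified here:

    import queue
    import threading
    import time

    class TranscriptionGlobals:
        _instance = None
        def __new__(cls):
            # Simplified stand-in for Singleton.Singleton: one instance per process.
            if cls._instance is None:
                cls._instance = super().__new__(cls)
                cls._instance.convo = []  # stand-in for conversation.Conversation
                cls._instance.audio_queue = queue.Queue()
            return cls._instance

    global_vars = TranscriptionGlobals()

    def transcribe_audio_queue(audio_queue, convo):
        while True:
            convo.append(audio_queue.get())  # worker writes into the shared convo

    transcribe_thread = threading.Thread(target=transcribe_audio_queue,
                                         name='Transcribe',
                                         args=(global_vars.audio_queue, global_vars.convo))
    transcribe_thread.daemon = True  # dies with the main thread, as in main()
    transcribe_thread.start()

    global_vars.audio_queue.put("chunk-1")
    time.sleep(0.1)  # let the worker consume before exit
    print(global_vars.convo)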