Continuous mode not working #148

Closed · wants to merge 40 commits

Commits (40)
9fde685
Update README.md for Transcribe
vivekuppal Jun 29, 2023
1483fea
Merge pull request #1 from vivekuppal/vu-readme-updates
vivekuppal Jun 29, 2023
391d728
Allow usage without a valid OPEN API key. (#2)
vivekuppal Jun 29, 2023
ab4245d
Update README.md (#3)
vivekuppal Jun 29, 2023
ebb6f2f
Allow user to choose model. Add arguments to main file.
vivekuppal Jun 29, 2023
8f5a595
Code clean up, add linting. (#4)
vivekuppal Jun 30, 2023
59d5c91
UI Text Chronology (#5)
vivekuppal Jun 30, 2023
f772bb8
Update readme with Enhancements. Allow copy of text from UI window. R…
vivekuppal Jun 30, 2023
87a38b1
Save conversation to text. (#9)
vivekuppal Jun 30, 2023
65d6dcf
Add Contextual Information to Responses (#11)
vivekuppal Jun 30, 2023
d1b3c45
Allow users to pause audio transcription. Change the default for getting responses to off. (#13)
vivekuppal Jul 3, 2023
cfca51a
Update main.py (#15)
abhinavuppal1 Jul 11, 2023
152bad3
Code reorg to separate UI code (#16)
vivekuppal Jul 12, 2023
addf17f
Add support for multiple languages (#18)
vivekuppal Jul 12, 2023
e5cda88
Easy install for non developers on windows (#20)
vivekuppal Jul 18, 2023
9896c1c
Disabled winrar UI (#22)
Adarsha-gg Jul 18, 2023
901501b
When using API, we do not need to specify language, absorb the lang p…
vivekuppal Jul 18, 2023
bd48b61
Language combo fix (#26)
Adarsha-gg Jul 19, 2023
7c9ca88
Added gdrive (#27)
Adarsha-gg Jul 19, 2023
2429c97
Allow usage of API Key in installed version of Transcribe (#28)
vivekuppal Jul 19, 2023
12ef846
updated the drive link (#30)
Adarsha-gg Jul 20, 2023
4be26c7
Add a duration class to easily measure the time taken for an operatio…
vivekuppal Jul 21, 2023
6e53b31
--api option was not working correctly (#34)
vivekuppal Jul 21, 2023
bd42b8c
Initial unit tests for the speech recognition library (#36)
vivekuppal Jul 24, 2023
af87eff
user reported defect fixes. (#39)
vivekuppal Jul 26, 2023
26cfaad
Optimize LLM usage (#40)
vivekuppal Jul 26, 2023
f8d5857
Bug fixes for exceptions observed during usage. Add further plumbing …
vivekuppal Jul 27, 2023
1356a78
Add logging infrastructure (#42)
vivekuppal Jul 27, 2023
a1cc48b
Get Response from LLM on demand (#44)
vivekuppal Jul 28, 2023
ea5f392
Models from open ai site (#43)
Adarsha-gg Jul 28, 2023
b4e03a4
List all active devices (#45)
vivekuppal Aug 1, 2023
85d09ed
Allow user to select input, output audio devices (#48)
vivekuppal Aug 21, 2023
28d1e9a
Disable mic speaker selectively (#49)
vivekuppal Aug 23, 2023
e48bdb8
Add Audio Response for LLM generated content (#50)
vivekuppal Aug 27, 2023
6baa77f
Update, upload latest binaries (#54)
Adarsha-gg Aug 30, 2023
fa55416
Multiturn prompts, bug fixes (#55)
vivekuppal Sep 5, 2023
ce5a1e1
Allow enable/disable speaker and microphone from UI (#56)
Adarsha-gg Sep 6, 2023
e445856
Update gdrive link (#58)
Adarsha-gg Sep 7, 2023
b50f58c
Bring readme up to date with current functionality. Describe content …
vivekuppal Sep 8, 2023
a7ea2cc
Continuous mode broke after updates to the UI.
vivekuppal Sep 8, 2023
Allow users to pause audio transcription. Change the default for getting responses to off. (#13)
vivekuppal committed Jul 3, 2023
commit d1b3c455760709df1aeb3030d29ad92b2b4b64ce
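In outline, the commit threads a single transcribe flag through AudioTranscriber and wires a new UI button to toggle it. A minimal sketch of the gating pattern, using the commit's names but a deliberately simplified body (the real code writes a temp WAV per phrase and tracks per-source state, as the diff below shows):

```
import queue

class AudioTranscriberSketch:
    """Simplified stand-in for the AudioTranscriber changes in this commit."""

    def __init__(self):
        self.transcribe = True  # transcription starts enabled

    def transcribe_audio_queue(self, audio_queue: queue.Queue):
        while True:
            # The queue is drained even while paused, so audio captured
            # during a pause is dropped rather than transcribed on resume.
            who_spoke, data, time_spoken = audio_queue.get()
            if not self.transcribe:
                continue  # paused: skip transcription for this chunk
            print(f"transcribing {len(data)} bytes from {who_spoke} at {time_spoken}")
```

The actual diff gates slightly differently (each helper returns early when self.transcribe is False), but the effect is the same: audio keeps flowing, text output stops.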
3 changes: 2 additions & 1 deletion AudioRecorder.py
@@ -59,4 +59,5 @@ def __init__(self):
                              chunk_size=pyaudio.get_sample_size(pyaudio.paInt16),
                              channels=default_speakers["maxInputChannels"])
         super().__init__(source=source, source_name="Speaker")
-        self.adjust_for_noise("Default Speaker", "Please make or play some noise from the Default Speaker...")
+        self.adjust_for_noise("Default Speaker",
+                              "Please make or play some noise from the Default Speaker...")
32 changes: 30 additions & 2 deletions AudioTranscriber.py
@@ -18,6 +18,7 @@ def __init__(self, mic_source, speaker_source, model):
         self.transcript_data = {"You": [], "Speaker": []}
         self.transcript_changed_event = threading.Event()
         self.audio_model = model
+        self.transcribe = True  # By default we start with transcription enabled
         self.audio_sources = {
             "You": {
                 "sample_rate": mic_source.SAMPLE_RATE,
@@ -40,6 +41,11 @@ def __init__(self, mic_source, speaker_source, model):
         }

     def transcribe_audio_queue(self, audio_queue):
+        """Transcribe data from audio sources. In this case we have 2 sources: microphone and speaker.
+        Args:
+          audio_queue: queue object with references to audio files
+        """
         while True:
             who_spoke, data, time_spoken = audio_queue.get()
             self.update_last_sample_and_phrase_status(who_spoke, data, time_spoken)
@@ -50,7 +56,8 @@ def transcribe_audio_queue(self, audio_queue):
                     fd, path = tempfile.mkstemp(suffix=".wav")
                     os.close(fd)
                     source_info["process_data_func"](source_info["last_sample"], path)
-                    text = self.audio_model.get_transcription(path)
+                    if self.transcribe:
+                        text = self.audio_model.get_transcription(path)
                 except Exception as exception:
                     print(exception)
                 finally:
@@ -61,6 +68,8 @@ def transcribe_audio_queue(self, audio_queue):
                 self.transcript_changed_event.set()

     def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken):
+        if not self.transcribe:
+            return
         source_info = self.audio_sources[who_spoke]
         if source_info["last_spoken"] and time_spoken - source_info["last_spoken"] > timedelta(seconds=PHRASE_TIMEOUT):
             source_info["last_sample"] = bytes()
@@ -69,15 +78,19 @@ def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken):
             source_info["new_phrase"] = False

         source_info["last_sample"] += data
-        source_info["last_spoken"] = time_spoken
+        source_info["last_spoken"] = time_spoken

     def process_mic_data(self, data, temp_file_name):
+        if not self.transcribe:
+            return
         audio_data = sr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"])
         wav_data = io.BytesIO(audio_data.get_wav_data())
         with open(temp_file_name, 'w+b') as f:
             f.write(wav_data.read())

     def process_speaker_data(self, data, temp_file_name):
+        if not self.transcribe:
+            return
         with wave.open(temp_file_name, 'wb') as wf:
             wf.setnchannels(self.audio_sources["Speaker"]["channels"])
             p = pyaudio.PyAudio()
@@ -86,6 +99,12 @@ def process_speaker_data(self, data, temp_file_name):
             wf.writeframes(data)

     def update_transcript(self, who_spoke, text, time_spoken):
+        """Update the transcript with new data
+        Args:
+          who_spoke: Person this audio is attributed to
+          text: Actual spoken words
+          time_spoken: Time at which the audio was captured, relative to start time
+        """
         source_info = self.audio_sources[who_spoke]
         transcript = self.transcript_data[who_spoke]

@@ -96,13 +115,22 @@ def update_transcript(self, who_spoke, text, time_spoken):
             transcript.append((f"{who_spoke}: [{text}]\n\n", time_spoken))

     def get_transcript(self, length: int = 0):
+        """Get the audio transcript
+        Args:
+          length: Return only the last `length` elements of the transcript.
+                  The default value of 0 returns the complete transcript.
+        """
         combined_transcript = list(merge(
             self.transcript_data["You"], self.transcript_data["Speaker"],
             key=lambda x: x[1], reverse=False))
         combined_transcript = combined_transcript[-length:]
         return "".join([t[0] for t in combined_transcript])

     def clear_transcript_data(self):
+        """Clear all data stored internally for the audio transcript
+        """
         self.transcript_data["You"].clear()
         self.transcript_data["Speaker"].clear()
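One caveat with the new gating in transcribe_audio_queue: when self.transcribe is False, text is never assigned, so if the code elided below the hunk reads text, that iteration raises UnboundLocalError (swallowed by the broad except). A defensive variant pre-binds the variable; a sketch of the same block under that assumption, with the elided finally body guessed as temp-file cleanup:

```
                text = ''   # pre-bind so 'text' exists even when transcription is paused
                path = None
                try:
                    fd, path = tempfile.mkstemp(suffix=".wav")
                    os.close(fd)
                    source_info["process_data_func"](source_info["last_sample"], path)
                    if self.transcribe:
                        text = self.audio_model.get_transcription(path)
                except Exception as exception:
                    print(exception)
                finally:
                    if path:
                        os.unlink(path)  # assumed cleanup; the original 'finally' body is elided
```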
6 changes: 3 additions & 3 deletions GPTResponder.py
@@ -39,10 +39,10 @@ def respond_to_transcriber(self, transcriber):
                 transcriber.transcript_changed_event.clear()
                 transcript_string = transcriber.get_transcript(length=MAX_PHRASES)
                 response = generate_response_from_transcript(transcript_string)

                 end_time = time.time()  # Measure end time
-                execution_time = end_time - start_time  # Calculate the time it took to execute the function
+                execution_time = end_time - start_time  # Calculate time to execute the function

                 if response != '':
                     self.response = response
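Only the timing comment changes in this hunk, but for orientation it helps to see the loop it sits in: respond_to_transcriber is the consumer of the transcript_changed_event that AudioTranscriber sets. A rough reconstruction under stated assumptions; the wait() call and the overall shape of the loop are inferred from the visible fragment, not shown in the diff:

```
import time

def respond_to_transcriber(self, transcriber):
    while True:
        # Assumed: block until the transcriber signals fresh transcript text.
        transcriber.transcript_changed_event.wait()
        start_time = time.time()  # Measure start time
        transcriber.transcript_changed_event.clear()
        transcript_string = transcriber.get_transcript(length=MAX_PHRASES)
        response = generate_response_from_transcript(transcript_string)
        end_time = time.time()  # Measure end time
        execution_time = end_time - start_time  # time spent generating the response
        if response != '':
            self.response = response
```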
7 changes: 4 additions & 3 deletions README.md
@@ -87,7 +87,7 @@ While Transcribe provides real-time transcription and optional response suggesti…

 **Whisper Model**: If the --api flag is not used, we utilize the 'tiny' version of the Whisper ASR model, due to its low resource consumption and fast response times. However, this model may not be as accurate as the larger models in transcribing certain types of speech, including accents or uncommon words.

-**OpenAI Account**: If a paid OpenAI account with a valid OpenAI API key is not used, the command window displays the following error message repeatedly, though the application behavior is not impacted in any way
+**OpenAI Account**: If a paid OpenAI account with a valid OpenAI API key is not used, the command window displays the following error message repeatedly, though the application behavior is not impacted in any way.
 ```
 Incorrect API key provided: API_KEY. You can find your API key at https://platform.openai.com/account/api-keys.
 ```
@@ -102,8 +102,9 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file…

 ## ➕ Enhancements from base repository ➕
 - Do not need an OpenAI key or paid OpenAI account to use the complete functionality
-- Allow contextual information to provide customized responses to users
-- Transcribe any video
+- Allow users to add contextual information to provide customized responses to conversation
+- Allow pausing audio transcription
+- Transcribe the audio of any video
 - Preserve all conversation text in UI
 - Allow saving conversation to file
47 changes: 30 additions & 17 deletions main.py
@@ -3,6 +3,7 @@
 from argparse import RawTextHelpFormatter
 from AudioTranscriber import AudioTranscriber
 from GPTResponder import GPTResponder
+import prompts
 import customtkinter as ctk
 import AudioRecorder
 import queue
@@ -25,7 +26,7 @@ def update_transcript_UI(transcriber, textbox):
     textbox.after(300, update_transcript_UI, transcriber, textbox)


-def update_response_UI(responder, textbox, update_interval_slider_label,
+def update_response_UI(responder, textbox, update_interval_slider_label,
                        update_interval_slider, freeze_state):
     if not freeze_state[0]:
         response = responder.response
@@ -66,8 +67,9 @@ def create_ui_components(root):
     response_textbox = ctk.CTkTextbox(root, width=300, font=("Arial", font_size),
                                       text_color='#639cdc', wrap="word")
     response_textbox.grid(row=0, column=1, padx=10, pady=20, sticky="nsew")
+    response_textbox.insert("0.0", prompts.INITIAL_RESPONSE)

-    freeze_button = ctk.CTkButton(root, text="Freeze", command=None)
+    freeze_button = ctk.CTkButton(root, text="Suggest Response", command=None)
     freeze_button.grid(row=1, column=1, padx=10, pady=3, sticky="nsew")

     update_interval_slider_label = ctk.CTkLabel(root, text="", font=("Arial", 12),
@@ -82,14 +84,17 @@ def create_ui_components(root):
     copy_button = ctk.CTkButton(root, text="Copy Audio Transcript", command=None)
     copy_button.grid(row=2, column=0, padx=10, pady=3, sticky="nsew")

-    save_file_button = ctk.CTkButton(root, text="Save to File", command=None)
+    save_file_button = ctk.CTkButton(root, text="Save Audio Transcript to File", command=None)
     save_file_button.grid(row=3, column=0, padx=10, pady=3, sticky="nsew")

+    transcript_button = ctk.CTkButton(root, text="Pause Transcript", command=None)
+    transcript_button.grid(row=4, column=0, padx=10, pady=3, sticky="nsew")
+
+    # Order of returned components is important. Add new components at the end.
     return [transcript_textbox, response_textbox, update_interval_slider,
             update_interval_slider_label, freeze_button, copy_button,
-            save_file_button]
+            save_file_button, transcript_button]
@@ -100,7 +105,8 @@ def main():
     cmd_args.add_argument('-a', '--api', action='store_true',
                           help='Use the online Open AI API for transcription.\
                           \nThis option requires an API KEY and will consume Open AI credits.')
-    cmd_args.add_argument('-m', '--model', action='store', choices=['tiny', 'base', 'small'], default='tiny',
+    cmd_args.add_argument('-m', '--model', action='store', choices=['tiny', 'base', 'small'],
+                          default='tiny',
                           help='Specify the model to use for transcription.'
                           '\nBy default tiny model is part of the install.'
                           '\nbase model has to be downloaded from the link https://drive.google.com/file/d/1E44DVjpfZX8tSrSagaDJXU91caZOkwa6/view?usp=drive_link'
@@ -127,6 +133,7 @@ def main():
     freeze_button = ui_components[4]
     copy_button = ui_components[5]
     save_file_button = ui_components[6]
+    transcript_button = ui_components[7]

     audio_queue = queue.Queue()
@@ -142,16 +149,16 @@ def main():
     # Transcribe and Respond threads, both work on the same instance of the AudioTranscriber class
     transcriber = AudioTranscriber(user_audio_recorder.source,
                                    speaker_audio_recorder.source, model)
-    transcribe = threading.Thread(target=transcriber.transcribe_audio_queue,
-                                  args=(audio_queue,))
-    transcribe.daemon = True
-    transcribe.start()
+    transcribe_thread = threading.Thread(target=transcriber.transcribe_audio_queue,
+                                         args=(audio_queue,))
+    transcribe_thread.daemon = True
+    transcribe_thread.start()

     responder = GPTResponder()
-    respond = threading.Thread(target=responder.respond_to_transcriber,
-                               args=(transcriber,))
-    respond.daemon = True
-    respond.start()
+    respond_thread = threading.Thread(target=responder.respond_to_transcriber,
+                                      args=(transcriber,))
+    respond_thread.daemon = True
+    respond_thread.start()

     print("READY")
@@ -163,15 +170,15 @@ def main():
     root.grid_columnconfigure(1, weight=1)

     # Add the clear transcript button to the UI
-    clear_transcript_button = ctk.CTkButton(root, text="Clear Transcript",
+    clear_transcript_button = ctk.CTkButton(root, text="Clear Audio Transcript",
                                             command=lambda: clear_context(transcriber, audio_queue))
     clear_transcript_button.grid(row=1, column=0, padx=10, pady=3, sticky="nsew")

-    freeze_state = [False]  # Using a list so inner functions can change its content
+    freeze_state = [True]  # Using a list so inner functions can change its content

     def freeze_unfreeze():
-        freeze_state[0] = not freeze_state[0]  # Invert the freeze state
-        freeze_button.configure(text="Unfreeze" if freeze_state[0] else "Freeze")
+        freeze_state[0] = not freeze_state[0]  # Invert the state
+        freeze_button.configure(text="Suggest Response" if freeze_state[0] else "Do Not Suggest Response")

     freeze_button.configure(command=freeze_unfreeze)
@@ -187,6 +194,12 @@ def save_file():

     save_file_button.configure(command=save_file)

+    def set_transcript_state():
+        transcriber.transcribe = not transcriber.transcribe
+        transcript_button.configure(text="Pause Transcript" if transcriber.transcribe else "Start Transcript")
+
+    transcript_button.configure(command=set_transcript_state)
+
     update_interval_slider_label.configure(text=f"Update interval: \
                                            {update_interval_slider.get()} \
                                            seconds")
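A side note on the freeze_state idiom used above: a single-element list is the classic workaround for rebinding a flag from inside a nested callback, since assigning to a plain boolean in freeze_unfreeze would only create a local variable. A self-contained illustration with hypothetical names:

```
def make_toggle(initial=True):
    state = [initial]  # one-element list acts as a mutable cell shared with the closure

    def toggle():
        state[0] = not state[0]  # mutate the cell; no rebinding, so no 'nonlocal' needed
        return state[0]

    return toggle

toggle = make_toggle()
print(toggle(), toggle())  # False True
```

On Python 3, declaring the variable nonlocal inside the callback is the more direct spelling; the list form works everywhere and is common in Tkinter callback code.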