Allow user to select input, output audio devices (#48)

Allow the user to select a non-default speaker or microphone using command-line arguments. The list of available devices can be obtained using the -l option.
SevaSk · vivekuppal · Jun 29, 2023 · Jun 29, 2023 · Jun 29, 2023 · Jun 29, 2023
commit 85d09ed2f4f3ea6693187962021830c265ecbbd5
diff --git a/AudioRecorder.py b/AudioRecorder.py
@@ -1,8 +1,8 @@
+from datetime import datetime
+from abc import abstractmethod
 import custom_speech_recognition as sr
 import pyaudiowpatch as pyaudio
-from datetime import datetime
 import app_logging as al
-from abc import abstractmethod
 
 RECORD_TIMEOUT = 3
 ENERGY_THRESHOLD = 1000
@@ -83,6 +83,8 @@ def print_detailed_audio_info(print_func=print):
 
 
 class BaseRecorder:
+ """Base class for Speaker, Microphone classes
+ """
  def __init__(self, source, source_name):
  root_logger.info(BaseRecorder.__name__)
  self.recorder = sr.Recognizer()
@@ -116,34 +118,63 @@ def record_callback(_, audio: sr.AudioData) -> None:
  phrase_time_limit=RECORD_TIMEOUT)
 
 
-class DefaultMicRecorder(BaseRecorder):
+class MicRecorder(BaseRecorder):
+ """Encapsulates the Microphone device audio input
+ """
  def __init__(self):
- root_logger.info(DefaultMicRecorder.__name__)
- with pyaudio.PyAudio() as p:
+ root_logger.info(MicRecorder.__name__)
+ with pyaudio.PyAudio() as py_audio:
  # WASAPI is windows specific
- wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
- default_mic = p.get_device_info_by_index(wasapi_info["defaultInputDevice"])
+ wasapi_info = py_audio.get_host_api_info_by_type(pyaudio.paWASAPI)
+ self.device_index = wasapi_info["defaultInputDevice"]
+ default_mic = py_audio.get_device_info_by_index(self.device_index)
 
  self.device_info = default_mic
 
  source = sr.Microphone(device_index=default_mic["index"],
  sample_rate=int(default_mic["defaultSampleRate"]),
  channels=default_mic["maxInputChannels"]
  )
+ self.source = source
  super().__init__(source=source, source_name="You")
  print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ')
+ # The line below is commented out because, for a non-default microphone, it can occasionally
+ # take several minutes to execute, thus delaying the start of the application.
  # self.adjust_for_noise("Default Mic", "Please make some noise from the Default Mic...")
 
  def get_name(self):
- return self.device_info['name']
+ return f'#{self.device_index} - {self.device_info["name"]}'
+
+ def set_device(self, index: int):
+ """Set active device based on index.
+ """
+ root_logger.info(MicRecorder.set_device.__name__)
+ with pyaudio.PyAudio() as py_audio:
+ self.device_index = index
+ mic = py_audio.get_device_info_by_index(self.device_index)
+
+ self.device_info = mic
+
+ source = sr.Microphone(device_index=mic["index"],
+ sample_rate=int(mic["defaultSampleRate"]),
+ channels=mic["maxInputChannels"]
+ )
+ self.source = source
+ print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ')
+ # The line below is commented out because, for a non-default microphone, it can occasionally
+ # take several minutes to execute, thus delaying the start of the application.
+ # self.adjust_for_noise("Default Mic", "Please make some noise from the Default Mic...")
 
 
-class DefaultSpeakerRecorder(BaseRecorder):
+class SpeakerRecorder(BaseRecorder):
+ """Encapsulates the Speaker device audio input
+ """
  def __init__(self):
- root_logger.info(DefaultSpeakerRecorder.__name__)
+ root_logger.info(SpeakerRecorder.__name__)
  with pyaudio.PyAudio() as p:
  wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
- default_speakers = p.get_device_info_by_index(wasapi_info["defaultOutputDevice"])
+ self.device_index = wasapi_info["defaultOutputDevice"]
+ default_speakers = p.get_device_info_by_index(self.device_index)
 
  if not default_speakers["isLoopbackDevice"]:
  for loopback in p.get_loopback_device_info_generator():
@@ -163,10 +194,38 @@ def __init__(self):
  super().__init__(source=source, source_name="Speaker")
  print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ')
  self.adjust_for_noise("Default Speaker",
- "Please make or play some noise from the Default Speaker...")
+ "Please play sound from Default Speaker...")
 
  def get_name(self):
- return self.device_info['name']
+ return f'#{self.device_index} - {self.device_info["name"]}'
+
+ def set_device(self, index: int):
+ """Set active device based on index.
+ """
+ root_logger.info(SpeakerRecorder.set_device.__name__)
+ with pyaudio.PyAudio() as p:
+ self.device_index = index
+ speakers = p.get_device_info_by_index(self.device_index)
+
+ if not speakers["isLoopbackDevice"]:
+ for loopback in p.get_loopback_device_info_generator():
+ if speakers["name"] in loopback["name"]:
+ speakers = loopback
+ break
+ else:
+ print("[ERROR] No loopback device found.")
+
+ self.device_info = speakers
+
+ source = sr.Microphone(speaker=True,
+ device_index=speakers["index"],
+ sample_rate=int(speakers["defaultSampleRate"]),
+ chunk_size=pyaudio.get_sample_size(pyaudio.paInt16),
+ channels=speakers["maxInputChannels"])
+ self.source = source
+ print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ')
+ self.adjust_for_noise("Speaker",
+ f"Please play sound from selected Speakers {self.get_name()}...")
 
 
 if __name__ == "__main__":

diff --git a/GlobalVars.py b/GlobalVars.py
@@ -15,8 +15,8 @@ class TranscriptionGlobals(Singleton.Singleton):
  """
 
  audio_queue: queue.Queue = None
- user_audio_recorder: AudioRecorder.DefaultMicRecorder = None
- speaker_audio_recorder: AudioRecorder.DefaultSpeakerRecorder = None
+ user_audio_recorder: AudioRecorder.MicRecorder = None
+ speaker_audio_recorder: AudioRecorder.SpeakerRecorder = None
  # Global for transcription from speaker, microphone
  transcriber: AudioTranscriber = None
  # Global for responses from openAI API
@@ -33,8 +33,8 @@ def __init__(self, key: str = 'API_KEY'):
  if self.audio_queue is None:
  self.audio_queue = queue.Queue()
  if self.user_audio_recorder is None:
- self.user_audio_recorder = AudioRecorder.DefaultMicRecorder()
+ self.user_audio_recorder = AudioRecorder.MicRecorder()
  if self.speaker_audio_recorder is None:
- self.speaker_audio_recorder = AudioRecorder.DefaultSpeakerRecorder()
+ self.speaker_audio_recorder = AudioRecorder.SpeakerRecorder()
  if self.api_key is None:
  self.api_key = key
diff --git a/main.py b/main.py
@@ -19,6 +19,8 @@
 
 
 def main():
+ """Primary method to run transcribe
+ """
  # Set up all arguments
  cmd_args = argparse.ArgumentParser(description='Command Line Arguments for Transcribe',
  formatter_class=RawTextHelpFormatter)
@@ -33,7 +35,7 @@ def main():
  cmd_args.add_argument('-m', '--model', action='store', choices=[
  'tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2', 'large'],
  default='tiny',
- help='Specify the model to use for transcription.'
+ help='Specify the LLM to use for transcription.'
  '\nBy default tiny english model is part of the install.'
  '\ntiny multi-lingual model has to be downloaded from the link '
  'https://drive.google.com/file/d/1M4AFutTmQROaE9xk2jPc5Y4oFRibHhEh/view?usp=drive_link'
@@ -59,6 +61,10 @@ def main():
  cmd_args.add_argument('-l', '--list_devices', action='store_true',
  help='List all audio drivers and audio devices on this machine. \
  \nUse this list index to select the microphone, speaker device for transcription.')
+ cmd_args.add_argument('-mi', '--mic_device_index', action='store', default=None, type=int,
+ help='Device index of the microphone for capturing sound.')
+ cmd_args.add_argument('-si', '--speaker_device_index', action='store', default=None, type=int,
+ help='Device index of the speaker for capturing sound.')
  args = cmd_args.parse_args()
 
  # Initiate config
@@ -78,6 +84,14 @@ def main():
  # Initiate logging
  log_listener = app_logging.initiate_log(config=config)
 
+ if args.mic_device_index is not None:
+ print('Override default microphone with device specified on command line.')
+ global_vars.user_audio_recorder.set_device(index=args.mic_device_index)
+
+ if args.speaker_device_index is not None:
+ print('Override default speaker with device specified on command line.')
+ global_vars.speaker_audio_recorder.set_device(index=args.speaker_device_index)
+
  try:
  subprocess.run(["ffmpeg", "-version"],
  stdout=subprocess.DEVNULL,
@@ -109,16 +123,14 @@ def main():
  root = ctk.CTk()
  ui_components = ui.create_ui_components(root)
  transcript_textbox = ui_components[0]
- response_textbox = ui_components[1]
+ global_vars.response_textbox = ui_components[1]
  update_interval_slider = ui_components[2]
  update_interval_slider_label = ui_components[3]
  global_vars.freeze_button = ui_components[4]
  lang_combobox = ui_components[5]
- filemenu = ui_components[6]
+ global_vars.filemenu = ui_components[6]
  response_now_button = ui_components[7]
 
- global_vars.filemenu = filemenu
- global_vars.response_textbox = response_textbox
  global_vars.user_audio_recorder.record_into_queue(global_vars.audio_queue)
 
  time.sleep(2)
@@ -164,8 +176,9 @@ def main():
  lang_combobox.configure(command=model.change_lang)
 
  ui.update_transcript_ui(global_vars.transcriber, transcript_textbox)
- ui.update_response_ui(global_vars.responder, response_textbox, update_interval_slider_label,
- update_interval_slider, global_vars.freeze_state)
+ ui.update_response_ui(global_vars.responder, global_vars.response_textbox,
+ update_interval_slider_label, update_interval_slider,
+ global_vars.freeze_state)
 
  root.mainloop()
  log_listener.stop()

diff --git a/ui.py b/ui.py
@@ -191,7 +191,7 @@ def create_ui_components(root):
  freeze_button = ctk.CTkButton(root, text="Suggest Responses Continuously", command=None)
  freeze_button.grid(row=1, column=1, padx=10, pady=3, sticky="nsew")
 
- response_now_button = ctk.CTkButton(root, text="Suggest Responses Now", command=None)
+ response_now_button = ctk.CTkButton(root, text="Suggest Response Now", command=None)
  response_now_button.grid(row=2, column=1, padx=10, pady=3, sticky="nsew")
 
  update_interval_slider_label = ctk.CTkLabel(root, text="", font=("Arial", 12),