Continuous mode not working #148

Closed

40 commits
9fde685
Update README.md for Transcribe
vivekuppal Jun 29, 2023
1483fea
Merge pull request #1 from vivekuppal/vu-readme-updates
vivekuppal Jun 29, 2023
391d728
Allow usage without a valid OPEN API key. (#2)
vivekuppal Jun 29, 2023
ab4245d
Update README.md (#3)
vivekuppal Jun 29, 2023
ebb6f2f
Allow user to choose model. Add arguments to main file.
vivekuppal Jun 29, 2023
8f5a595
Code clean up, add linting. (#4)
vivekuppal Jun 30, 2023
59d5c91
UI Text Chronology (#5)
vivekuppal Jun 30, 2023
f772bb8
Update readme with Enhancements. Allow copy of text from UI window. R…
vivekuppal Jun 30, 2023
87a38b1
Save conversation to text. (#9)
vivekuppal Jun 30, 2023
65d6dcf
Add Contextual Information to Responses (#11)
vivekuppal Jun 30, 2023
d1b3c45
Allow users to pause audio transcription. Change the default for gett…
vivekuppal Jul 3, 2023
cfca51a
Update main.py (#15)
abhinavuppal1 Jul 11, 2023
152bad3
Code reorg to separate UI code (#16)
vivekuppal Jul 12, 2023
addf17f
Add support for multiple languages (#18)
vivekuppal Jul 12, 2023
e5cda88
Easy install for non developers on windows (#20)
vivekuppal Jul 18, 2023
9896c1c
Disabled winrar UI (#22)
Adarsha-gg Jul 18, 2023
901501b
When using API, we do not need to specify language, absorb the lang p…
vivekuppal Jul 18, 2023
bd48b61
Language combo fix (#26)
Adarsha-gg Jul 19, 2023
7c9ca88
Added gdrive (#27)
Adarsha-gg Jul 19, 2023
2429c97
Allow usage of API Key in installed version of Transcribe (#28)
vivekuppal Jul 19, 2023
12ef846
updated the drive link (#30)
Adarsha-gg Jul 20, 2023
4be26c7
Add a duration class to easily measure the time taken for an operatio…
vivekuppal Jul 21, 2023
6e53b31
--api option was not working correctly (#34)
vivekuppal Jul 21, 2023
bd42b8c
Initial unit tests for the speech recognition library (#36)
vivekuppal Jul 24, 2023
af87eff
user reported defect fixes. (#39)
vivekuppal Jul 26, 2023
26cfaad
Optimize LLM usage (#40)
vivekuppal Jul 26, 2023
f8d5857
Bug fixes for exceptions observed during usage. Add further plumbing …
vivekuppal Jul 27, 2023
1356a78
Add logging infrastructure (#42)
vivekuppal Jul 27, 2023
a1cc48b
Get Response from LLM on demand (#44)
vivekuppal Jul 28, 2023
ea5f392
Models from open ai site (#43)
Adarsha-gg Jul 28, 2023
b4e03a4
List all active devices (#45)
vivekuppal Aug 1, 2023
85d09ed
Allow user to select input, output audio devices (#48)
vivekuppal Aug 21, 2023
28d1e9a
Disable mic speaker selectively (#49)
vivekuppal Aug 23, 2023
e48bdb8
Add Audio Response for LLM generated content (#50)
vivekuppal Aug 27, 2023
6baa77f
Update, upload latest binaries (#54)
Adarsha-gg Aug 30, 2023
fa55416
Multiturn prompts, bug fixes (#55)
vivekuppal Sep 5, 2023
ce5a1e1
Allow enable/disable speaker and microphone from UI (#56)
Adarsha-gg Sep 6, 2023
e445856
Update gdrive link (#58)
Adarsha-gg Sep 7, 2023
b50f58c
Bring readme up to date with current functionality. Describe content …
vivekuppal Sep 8, 2023
a7ea2cc
Continuous mode broke after updates to the UI.
vivekuppal Sep 8, 2023
Easy install for non developers on windows (#20)
- Add script for generation of binaries
- Separate transcription related vars into a singleton global class
- Further remove ui code from main.py class
- Add a File, Edit menu
vivekuppal committed Jul 18, 2023
commit e5cda88360605ef420be2ba8d36bbe071e9d36b5
4 changes: 3 additions & 1 deletion .gitignore
@@ -1,3 +1,5 @@
__pycache__/
*.wav
.venv/
.venv/
venv
output
9 changes: 5 additions & 4 deletions AudioRecorder.py
@@ -26,11 +26,12 @@ def adjust_for_noise(self, device_name, msg):
print(f"[INFO] Completed ambient noise adjustment for {device_name}.")

def record_into_queue(self, audio_queue):
def record_callback(_, audio:sr.AudioData) -> None:
def record_callback(_, audio: sr.AudioData) -> None:
data = audio.get_raw_data()
audio_queue.put((self.source_name, data, datetime.utcnow()))

self.recorder.listen_in_background(self.source, record_callback, phrase_time_limit=RECORD_TIMEOUT)
self.recorder.listen_in_background(self.source, record_callback,
phrase_time_limit=RECORD_TIMEOUT)


class DefaultMicRecorder(BaseRecorder):
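A minimal sketch (not part of the diff) of draining the queue that `record_into_queue` feeds; the tuple layout follows `record_callback` above, and `recorder` is an assumed `BaseRecorder` instance:

```
import queue
import threading

def consume(audio_queue: queue.Queue) -> None:
    # Each entry is (source_name, raw_audio_bytes, utc_timestamp), per record_callback.
    while True:
        source_name, data, when = audio_queue.get()  # blocks until audio arrives
        print(f"{when:%H:%M:%S} {source_name}: {len(data)} bytes")

q = queue.Queue()
threading.Thread(target=consume, args=(q,), daemon=True).start()
# recorder.record_into_queue(q)
```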
@@ -44,15 +45,15 @@ def __init__(self):
with pyaudio.PyAudio() as p:
wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
default_speakers = p.get_device_info_by_index(wasapi_info["defaultOutputDevice"])

if not default_speakers["isLoopbackDevice"]:
for loopback in p.get_loopback_device_info_generator():
if default_speakers["name"] in loopback["name"]:
default_speakers = loopback
break
else:
print("[ERROR] No loopback device found.")

source = sr.Microphone(speaker=True,
device_index= default_speakers["index"],
sample_rate=int(default_speakers["defaultSampleRate"]),
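For reference, a runnable sketch of the loopback lookup `DefaultSpeakerRecorder` performs above (Windows-only, requires pyaudiowpatch):

```
import pyaudiowpatch as pyaudio

with pyaudio.PyAudio() as p:
    wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
    default_speakers = p.get_device_info_by_index(wasapi_info["defaultOutputDevice"])
    # The default output device is not itself a loopback device; find the
    # loopback twin that carries the same name.
    if not default_speakers["isLoopbackDevice"]:
        for loopback in p.get_loopback_device_info_generator():
            if default_speakers["name"] in loopback["name"]:
                default_speakers = loopback
                break
        else:
            print("[ERROR] No loopback device found.")
    print("Speaker capture device:", default_speakers["name"])
```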
1 change: 0 additions & 1 deletion AudioTranscriber.py
@@ -5,7 +5,6 @@
import wave
import tempfile
import whisper
import torch
import custom_speech_recognition as sr
import pyaudiowpatch as pyaudio
from heapq import merge
18 changes: 18 additions & 0 deletions README.md
@@ -79,6 +79,24 @@ Upon initiation, Transcribe will begin transcribing microphone input and speaker

The --api flag will use the Whisper API for transcriptions. This significantly enhances transcription speed and accuracy, and it works in most languages (rather than just English without the flag). However, keep in mind that using the Whisper API consumes more OpenAI credits than using the local model. This increased cost is attributed to the advanced features and capabilities that the Whisper API provides. Despite the additional expense, the substantial improvements in speed and transcription accuracy may make it worthwhile for your use case.

### For Non-Developers (Windows)
**Prerequisites:**

Install WinRAR from https://www.win-rar.com/.

In the file ```generate_binary.bat```, replace these paths at the top of the file with paths specific to your machine.

```
SET SOURCE_DIR=D:\Code\transcribe
SET OUTPUT_DIR=D:\Code\transcribe\output
SET LIBSITE_PACAGES_DIR=D:\Code\transcribe\venv\Lib\site-packages
SET EXECUTABLE_NAME=transcribe.exe
SET ZIP_FILE_DIR=D:\Code\transcribe\transcribe.rar
SET WINRAR=C:\Program Files\WinRAR\winRAR.exe
```

Run ```generate_binary.bat```. It should generate a zip file with everything compiled. To run the program, open the zip file and launch transcribe.exe.

### ⚡️ Limitations ⚡️

While Transcribe provides real-time transcription and optional response suggestions, there are several known limitations to its functionality that you should be aware of:
111 changes: 76 additions & 35 deletions custom_speech_recognition/__init__.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3

"""Library for performing speech recognition, with support for several engines and APIs, online and offline."""
"""Library for performing speech recognition, with support for several
engines and APIs, online and offline."""

import io
import os
@@ -57,27 +58,40 @@ def __exit__(self, exc_type, exc_value, traceback):

class Microphone(AudioSource):
"""
Creates a new ``Microphone`` instance, which represents a physical microphone on the computer. Subclass of ``AudioSource``.
Creates a new ``Microphone`` instance, which represents a physical microphone on the computer.
Subclass of ``AudioSource``.

This will throw an ``AttributeError`` if you don't have PyAudio 0.2.11 or later installed.

If ``device_index`` is unspecified or ``None``, the default microphone is used as the audio source. Otherwise, ``device_index`` should be the index of the device to use for audio input.
If ``device_index`` is unspecified or ``None``, the default microphone is used as the
audio source.

A device index is an integer between 0 and ``pyaudio.get_device_count() - 1`` (assume we have used ``import pyaudio`` beforehand) inclusive. It represents an audio device such as a microphone or speaker. See the `PyAudio documentation <http:https://people.csail.mit.edu/hubert/pyaudio/docs/>`__ for more details.
Otherwise, ``device_index`` should be the index of the device to use for audio input.
A device index is an integer between 0 and ``pyaudio.get_device_count() - 1`` (assume
we have used ``import pyaudio`` beforehand) inclusive. It represents an audio device such
as a microphone or speaker.

The microphone audio is recorded in chunks of ``chunk_size`` samples, at a rate of ``sample_rate`` samples per second (Hertz). If not specified, the value of ``sample_rate`` is determined automatically from the system's microphone settings.
See the `PyAudio documentation <http:https://people.csail.mit.edu/hubert/pyaudio/docs/>`__
for more details.
The microphone audio is recorded in chunks of ``chunk_size`` samples, at a rate of
``sample_rate`` samples per second (Hertz). If not specified, the value of ``sample_rate``
is determined automatically from the system's microphone settings.

Higher ``sample_rate`` values result in better audio quality, but also more bandwidth (and therefore, slower recognition). Additionally, some CPUs, such as those in older Raspberry Pi models, can't keep up if this value is too high.
Higher ``sample_rate`` values result in better audio quality, but also more bandwidth
(and therefore, slower recognition). Additionally, some CPUs, such as those in older
Raspberry Pi models, can't keep up if this value is too high.

Higher ``chunk_size`` values help avoid triggering on rapidly changing ambient noise, but also makes detection less sensitive. This value, generally, should be left at its default.
Higher ``chunk_size`` values help avoid triggering on rapidly changing ambient noise,
but also makes detection less sensitive. This value, generally, should be left at its default.
"""
def __init__(self, device_index=None, sample_rate=None, chunk_size=1024, speaker=False, channels = 1):
def __init__(self, device_index=None, sample_rate=None, chunk_size=1024,
speaker=False, channels=1):
assert device_index is None or isinstance(device_index, int), "Device index must be None or an integer"
assert sample_rate is None or (isinstance(sample_rate, int) and sample_rate > 0), "Sample rate must be None or a positive integer"
assert isinstance(chunk_size, int) and chunk_size > 0, "Chunk size must be a positive integer"

# set up PyAudio
self.speaker=speaker
self.speaker = speaker
self.pyaudio_module = self.get_pyaudio()
audio = self.pyaudio_module.PyAudio()
try:
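To make the parameter interplay concrete, a usage sketch (device index 2 and the 16 kHz rate are illustrative assumptions, not defaults):

```
import custom_speech_recognition as sr

# Explicit parameters; omit sample_rate to have it auto-detected from the device.
mic = sr.Microphone(device_index=2, sample_rate=16000, chunk_size=1024)
with mic as source:
    print("Opened at", source.SAMPLE_RATE, "Hz")
```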
@@ -104,7 +118,8 @@ def __init__(self, device_index=None, sample_rate=None, chunk_size=1024, speaker
@staticmethod
def get_pyaudio():
"""
Imports the pyaudio module and checks its version. Throws exceptions if pyaudio can't be found or a wrong version is installed
Imports the pyaudio module and checks its version. Throws exceptions if pyaudio
can't be found or a wrong version is installed
"""
try:
import pyaudiowpatch as pyaudio
@@ -118,9 +133,12 @@ def get_pyaudio():
@staticmethod
def list_microphone_names():
"""
Returns a list of the names of all available microphones. For microphones where the name can't be retrieved, the list entry contains ``None`` instead.
Returns a list of the names of all available microphones. For microphones where
the name can't be retrieved, the list entry contains ``None`` instead.

The index of each microphone's name in the returned list is the same as its device index when creating a ``Microphone`` instance - if you want to use the microphone at index 3 in the returned list, use ``Microphone(device_index=3)``.
The index of each microphone's name in the returned list is the same as its
device index when creating a ``Microphone`` instance - if you want to use the
microphone at index 3 in the returned list, use ``Microphone(device_index=3)``.
"""
audio = Microphone.get_pyaudio().PyAudio()
try:
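A short sketch of the index-to-name correspondence the docstring describes:

```
from custom_speech_recognition import Microphone

for index, name in enumerate(Microphone.list_microphone_names()):
    print(index, name)
# Microphone(device_index=3) would then use the device printed at index 3.
```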
@@ -135,9 +153,15 @@ def list_microphone_names():
@staticmethod
def list_working_microphones():
"""
Returns a dictionary mapping device indices to microphone names, for microphones that are currently hearing sounds. When using this function, ensure that your microphone is unmuted and make some noise at it to ensure it will be detected as working.

Each key in the returned dictionary can be passed to the ``Microphone`` constructor to use that microphone. For example, if the return value is ``{3: "HDA Intel PCH: ALC3232 Analog (hw:1,0)"}``, you can do ``Microphone(device_index=3)`` to use that microphone.
Returns a dictionary mapping device indices to microphone names,
for microphones that are currently hearing sounds. When using this function,
ensure that your microphone is unmuted and make some noise at it to ensure
it will be detected as working.

Each key in the returned dictionary can be passed to the ``Microphone``
constructor to use that microphone. For example, if the return value
is ``{3: "HDA Intel PCH: ALC3232 Analog (hw:1,0)"}``, you can do
``Microphone(device_index=3)`` to use that microphone.
"""
pyaudio_module = Microphone.get_pyaudio()
audio = pyaudio_module.PyAudio()
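And a sketch of selecting the first microphone that is actually picking up sound:

```
from custom_speech_recognition import Microphone

working = Microphone.list_working_microphones()
if working:
    index, name = next(iter(working.items()))
    print("Using", name)
    mic = Microphone(device_index=index)
else:
    print("No working microphones found; unmute one and make some noise.")
```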
@@ -225,17 +249,28 @@ def close(self):

class AudioFile(AudioSource):
"""
Creates a new ``AudioFile`` instance given a WAV/AIFF/FLAC audio file ``filename_or_fileobject``. Subclass of ``AudioSource``.
Creates a new ``AudioFile`` instance given a WAV/AIFF/FLAC file ``filename_or_fileobject``.
Subclass of ``AudioSource``.

If ``filename_or_fileobject`` is a string, then it is interpreted as a path to an audio file on the filesystem. Otherwise, ``filename_or_fileobject`` should be a file-like object such as ``io.BytesIO`` or similar.
If ``filename_or_fileobject`` is a string, then it is interpreted as a path
to an audio file on the filesystem.
Otherwise, ``filename_or_fileobject`` should be a file-like object such
as ``io.BytesIO`` or similar.

Note that functions that read from the audio (such as ``recognizer_instance.record`` or ``recognizer_instance.listen``) will move ahead in the stream. For example, if you execute ``recognizer_instance.record(audiofile_instance, duration=10)`` twice, the first time it will return the first 10 seconds of audio, and the second time it will return the 10 seconds of audio right after that. This is always reset to the beginning when entering an ``AudioFile`` context.
Note that functions that read from the audio (such as ``recognizer_instance.record``
or ``recognizer_instance.listen``) will move ahead in the stream. For example,
if you execute ``recognizer_instance.record(audiofile_instance, duration=10)``
twice, the first time it will return the first 10 seconds of audio, and the second
time it will return the 10 seconds of audio right after that. This is always reset
to the beginning when entering an ``AudioFile`` context.

WAV files must be in PCM/LPCM format; WAVE_FORMAT_EXTENSIBLE and compressed WAV are not supported and may result in undefined behaviour.
WAV files must be in PCM/LPCM format; WAVE_FORMAT_EXTENSIBLE and compressed WAV
are not supported and may result in undefined behaviour.

Both AIFF and AIFF-C (compressed AIFF) formats are supported.

FLAC files must be in native FLAC format; OGG-FLAC is not supported and may result in undefined behaviour.
FLAC files must be in native FLAC format; OGG-FLAC is not supported and
may result in undefined behaviour.
"""

def __init__(self, filename_or_fileobject):
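A sketch of the stream-advance behaviour described in the docstring ("example.wav" is a placeholder file name):

```
import custom_speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile("example.wav") as source:
    first_ten = r.record(source, duration=10)  # seconds 0-10
    next_ten = r.record(source, duration=10)   # seconds 10-20
# Re-entering the `with` block resets the stream to the beginning.
```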
@@ -1518,23 +1553,23 @@ def recognize_whisper(self, audio_data, model="base", show_dict=False, load_opti
return result["text"]

recognize_whisper_api = whisper.recognize_whisper_api

def recognize_vosk(self, audio_data, language='en'):
from vosk import Model, KaldiRecognizer

assert isinstance(audio_data, AudioData), "Data must be audio data"

if not hasattr(self, 'vosk_model'):
if not os.path.exists("model"):
return "Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder."
exit (1)
exit(1)
self.vosk_model = Model("model")

rec = KaldiRecognizer(self.vosk_model, 16000);
rec.AcceptWaveform(audio_data.get_raw_data(convert_rate=16000, convert_width=2));
rec = KaldiRecognizer(self.vosk_model, 16000)

rec.AcceptWaveform(audio_data.get_raw_data(convert_rate=16000, convert_width=2))
finalRecognition = rec.FinalResult()

return finalRecognition
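A usage sketch for the Vosk path above; it assumes recognize_vosk is bound to Recognizer as the other engines are, that a Vosk model has been unpacked as "model" in the working directory, and that "example.wav" exists. The return value is Vosk's JSON result string:

```
import custom_speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile("example.wav") as source:
    audio = r.record(source)
print(r.recognize_vosk(audio, language='en'))  # e.g. '{"text" : "hello world"}'
```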


@@ -1579,18 +1614,24 @@ def recognize_api(self, audio_data, client_access_token, language="en", session_
while True:
boundary = uuid.uuid4().hex
if boundary.encode("utf-8") not in wav_data: break
if session_id is None: session_id = uuid.uuid4().hex
if session_id is None:
session_id = uuid.uuid4().hex
data = b"--" + boundary.encode("utf-8") + b"\r\n" + b"Content-Disposition: form-data; name=\"request\"\r\n" + b"Content-Type: application/json\r\n" + b"\r\n" + b"{\"v\": \"20150910\", \"sessionId\": \"" + session_id.encode("utf-8") + b"\", \"lang\": \"" + language.encode("utf-8") + b"\"}\r\n" + b"--" + boundary.encode("utf-8") + b"\r\n" + b"Content-Disposition: form-data; name=\"voiceData\"; filename=\"audio.wav\"\r\n" + b"Content-Type: audio/wav\r\n" + b"\r\n" + wav_data + b"\r\n" + b"--" + boundary.encode("utf-8") + b"--\r\n"
request = Request(url, data=data, headers={"Authorization": "Bearer {}".format(client_access_token), "Content-Length": str(len(data)), "Expect": "100-continue", "Content-Type": "multipart/form-data; boundary={}".format(boundary)})
try: response = urlopen(request, timeout=10)
except HTTPError as e: raise RequestError("recognition request failed: {}".format(e.reason))
except URLError as e: raise RequestError("recognition connection failed: {}".format(e.reason))
try:
response = urlopen(request, timeout=10)
except HTTPError as e:
raise RequestError("recognition request failed: {}".format(e.reason))
except URLError as e:
raise RequestError("recognition connection failed: {}".format(e.reason))
response_text = response.read().decode("utf-8")
result = json.loads(response_text)
if show_all: return result
if show_all:
return result
if "status" not in result or "errorType" not in result["status"] or result["status"]["errorType"] != "success":
raise UnknownValueError()
return result["result"]["resolvedQuery"]


Recognizer.recognize_api = classmethod(recognize_api) # API.AI Speech Recognition is deprecated/not recommended as of 3.5.0, and currently is only optionally available for paid plans
# API.AI Speech Recognition is deprecated/not recommended as of 3.5.0, and currently
# is only optionally available for paid plans
Recognizer.recognize_api = classmethod(recognize_api)
45 changes: 45 additions & 0 deletions generate_binary.bat
@@ -0,0 +1,45 @@
REM Define variables for different hard coded paths (Change everything to your local PATHs)
REM SET SOURCE_DIR=D:\Code\transcribe
REM SET OUTPUT_DIR=D:\Code\transcribe\output
REM SET LIBSITE_PACAGES_DIR=D:\Code\transcribe\venv\Lib\site-packages
REM SET EXECUTABLE_NAME=transcribe.exe
REM SET ZIP_FILE_DIR=D:\Code\transcribe\transcribe.rar
REM SET ZIP_LOCATION=D:\Code\transcribe\output\dist\transcribe.exe
REM SET WINRAR=C:\Program Files\WinRAR\winRAR.exe

REM Define variables for different hard coded paths (Change everything to your local PATHs)
SET SOURCE_DIR=C:\git\transcribe
REM Contents of output dir are deleted at the end of the script
SET OUTPUT_DIR=C:\git\output
SET LIBSITE_PACAGES_DIR=C:\pyenv\transcribe\Lib\site-packages
SET EXECUTABLE_NAME=transcribe.exe
SET ZIP_FILE_DIR=C:\git\output\transcribe.rar
SET ZIP_LOCATION=C:\git\output\dist\transcribe.exe
SET WINRAR=C:\Program Files\WinRAR\winRAR.exe

REM pyinstaller --clean --noconfirm --specpath C:\\git\\output --distpath C:\\git\\output\dist -n transcribe.exe --log-level DEBUG --recursive-copy-metadata "openai-whisper" main.py

SET PYINSTALLER_DIST_PATH=%OUTPUT_DIR%\dist
SET PYINSTALLER_TEMP_PATH=%OUTPUT_DIR%\temp
ECHO %PYINSTALLER_DIST_PATH%

pyinstaller --clean --noconfirm --workpath %PYINSTALLER_TEMP_PATH% --specpath %OUTPUT_DIR% --distpath %PYINSTALLER_DIST_PATH% -n %EXECUTABLE_NAME% --log-level DEBUG main.py

SET ASSETS_DIR_SRC=%LIBSITE_PACAGES_DIR%\whisper\assets\
SET ASSETS_DIR_DEST=%PYINSTALLER_DIST_PATH%\%EXECUTABLE_NAME%\whisper\assets

REM ensure the appropriate directories exist
if not exist %PYINSTALLER_DIST_PATH%\%EXECUTABLE_NAME%\whisper mkdir %PYINSTALLER_DIST_PATH%\%EXECUTABLE_NAME%\whisper
if not exist %ASSETS_DIR_DEST% mkdir %ASSETS_DIR_DEST%

REM Copy appropriate files to the dir
copy %SOURCE_DIR%\tiny.en.pt %OUTPUT_DIR%\dist\%EXECUTABLE_NAME%\tiny.en.pt
copy %ASSETS_DIR_SRC%\mel_filters.npz %ASSETS_DIR_DEST%
copy %ASSETS_DIR_SRC%\gpt2.tiktoken %ASSETS_DIR_DEST%

REM Code for zipping the final package
"%WINRAR%" a -r -ep1 -df "%ZIP_FILE_DIR%" "%ZIP_LOCATION%"

REM Remove the temp, dist folders
rmdir /S /Q %PYINSTALLER_DIST_PATH%
rmdir /S /Q %PYINSTALLER_TEMP_PATH%
33 changes: 33 additions & 0 deletions globals.py
@@ -0,0 +1,33 @@
import queue
from AudioTranscriber import AudioTranscriber
from GPTResponder import GPTResponder
import AudioRecorder
import customtkinter as ctk


class TranscriptionGlobals(object):
# Global constants for audio processing. It is implemented as a singleton

audio_queue: queue.Queue = None
user_audio_recorder: AudioRecorder.DefaultMicRecorder = None
speaker_audio_recorder: AudioRecorder.DefaultSpeakerRecorder = None
# Global for transcription from speaker, microphone
transcriber: AudioTranscriber = None
# Global for responses from openAI API
responder: GPTResponder = None
# Global for determining whether to seek responses from openAI API
freeze_state: list = None
freeze_button: ctk.CTkButton = None

def __new__(cls):
if not hasattr(cls, 'instance'):
cls.instance = super(TranscriptionGlobals, cls).__new__(cls)
return cls.instance

def __init__(self):
if self.audio_queue is None:
self.audio_queue = queue.Queue()
if self.user_audio_recorder is None:
self.user_audio_recorder = AudioRecorder.DefaultMicRecorder()
if self.speaker_audio_recorder is None:
self.speaker_audio_recorder = AudioRecorder.DefaultSpeakerRecorder()
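A sketch of the singleton behaviour (note that the first construction also creates the shared queue and both recorders, so it needs working audio devices):

```
a = TranscriptionGlobals()
b = TranscriptionGlobals()
assert a is b                          # __new__ returns the single shared instance
assert a.audio_queue is b.audio_queue  # state is initialised once in __init__
```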