Continuous mode not working #148

Closed · wants to merge 40 commits

Commits (40)
9fde685
Update README.md for Transcribe
vivekuppal Jun 29, 2023
1483fea
Merge pull request #1 from vivekuppal/vu-readme-updates
vivekuppal Jun 29, 2023
391d728
Allow usage without a valid OPEN API key. (#2)
vivekuppal Jun 29, 2023
ab4245d
Update README.md (#3)
vivekuppal Jun 29, 2023
ebb6f2f
Allow user to choose model. Add arguments to main file.
vivekuppal Jun 29, 2023
8f5a595
Code clean up, add linting. (#4)
vivekuppal Jun 30, 2023
59d5c91
UI Text Chronology (#5)
vivekuppal Jun 30, 2023
f772bb8
Update readme with Enhancements. Allow copy of text from UI window. R…
vivekuppal Jun 30, 2023
87a38b1
Save conversation to text. (#9)
vivekuppal Jun 30, 2023
65d6dcf
Add Contextual Information to Responses (#11)
vivekuppal Jun 30, 2023
d1b3c45
Allow users to pause audio transcription. Change the default for getting responses to off. (#13)
vivekuppal Jul 3, 2023
cfca51a
Update main.py (#15)
abhinavuppal1 Jul 11, 2023
152bad3
Code reorg to separate UI code (#16)
vivekuppal Jul 12, 2023
addf17f
Add support for multiple languages (#18)
vivekuppal Jul 12, 2023
e5cda88
Easy install for non developers on windows (#20)
vivekuppal Jul 18, 2023
9896c1c
Disabled winrar UI (#22)
Adarsha-gg Jul 18, 2023
901501b
When using API, we do not need to specify language, absorb the lang p…
vivekuppal Jul 18, 2023
bd48b61
Language combo fix (#26)
Adarsha-gg Jul 19, 2023
7c9ca88
Added gdrive (#27)
Adarsha-gg Jul 19, 2023
2429c97
Allow usage of API Key in installed version of Transcribe (#28)
vivekuppal Jul 19, 2023
12ef846
updated the drive link (#30)
Adarsha-gg Jul 20, 2023
4be26c7
Add a duration class to easily measure the time taken for an operatio…
vivekuppal Jul 21, 2023
6e53b31
--api option was not working correctly (#34)
vivekuppal Jul 21, 2023
bd42b8c
Initial unit tests for the speech recognition library (#36)
vivekuppal Jul 24, 2023
af87eff
user reported defect fixes. (#39)
vivekuppal Jul 26, 2023
26cfaad
Optimize LLM usage (#40)
vivekuppal Jul 26, 2023
f8d5857
Bug fixes for exceptions observed during usage. Add further plumbing …
vivekuppal Jul 27, 2023
1356a78
Add logging infrastructure (#42)
vivekuppal Jul 27, 2023
a1cc48b
Get Response from LLM on demand (#44)
vivekuppal Jul 28, 2023
ea5f392
Models from open ai site (#43)
Adarsha-gg Jul 28, 2023
b4e03a4
List all active devices (#45)
vivekuppal Aug 1, 2023
85d09ed
Allow user to select input, output audio devices (#48)
vivekuppal Aug 21, 2023
28d1e9a
Disable mic speaker selectively (#49)
vivekuppal Aug 23, 2023
e48bdb8
Add Audio Response for LLM generated content (#50)
vivekuppal Aug 27, 2023
6baa77f
Update, upload latest binaries (#54)
Adarsha-gg Aug 30, 2023
fa55416
Multiturn prompts, bug fixes (#55)
vivekuppal Sep 5, 2023
ce5a1e1
Allow enable/disable speaker and microphone from UI (#56)
Adarsha-gg Sep 6, 2023
e445856
Update gdrive link (#58)
Adarsha-gg Sep 7, 2023
b50f58c
Bring readme up to date with current functionality. Describe content …
vivekuppal Sep 8, 2023
a7ea2cc
Continuous mode broke after updates to the UI.
vivekuppal Sep 8, 2023
Allow users to pause audio transcription. Change the default for getting responses to off. (#13)
vivekuppal committed Jul 3, 2023
commit d1b3c455760709df1aeb3030d29ad92b2b4b64ce
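In outline, the commit threads a single transcribe flag through AudioTranscriber and wires a new UI button to toggle it. A minimal sketch of the gating pattern, using the commit's names but a deliberately simplified body (the real code writes a temp WAV per phrase and tracks per-source state, as the diff below shows):

```
import queue

class AudioTranscriberSketch:
    """Simplified stand-in for the AudioTranscriber changes in this commit."""

    def __init__(self):
        self.transcribe = True  # transcription starts enabled

    def transcribe_audio_queue(self, audio_queue: queue.Queue):
        while True:
            # The queue is drained even while paused, so audio captured
            # during a pause is dropped rather than transcribed on resume.
            who_spoke, data, time_spoken = audio_queue.get()
            if not self.transcribe:
                continue  # paused: skip transcription for this chunk
            print(f"transcribing {len(data)} bytes from {who_spoke} at {time_spoken}")
```

The actual diff gates slightly differently (each helper returns early when self.transcribe is False), but the effect is the same: audio keeps flowing, text output stops.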
3 changes: 2 additions & 1 deletion AudioRecorder.py
@@ -59,4 +59,5 @@ def __init__(self):
                              chunk_size=pyaudio.get_sample_size(pyaudio.paInt16),
                              channels=default_speakers["maxInputChannels"])
         super().__init__(source=source, source_name="Speaker")
-        self.adjust_for_noise("Default Speaker", "Please make or play some noise from the Default Speaker...")
+        self.adjust_for_noise("Default Speaker",
+                              "Please make or play some noise from the Default Speaker...")
32 changes: 30 additions & 2 deletions AudioTranscriber.py
@@ -18,6 +18,7 @@ def __init__(self, mic_source, speaker_source, model):
         self.transcript_data = {"You": [], "Speaker": []}
         self.transcript_changed_event = threading.Event()
         self.audio_model = model
+        self.transcribe = True  # By default we start with transcription enabled
         self.audio_sources = {
             "You": {
                 "sample_rate": mic_source.SAMPLE_RATE,
@@ -40,6 +41,11 @@ def __init__(self, mic_source, speaker_source, model):
         }

     def transcribe_audio_queue(self, audio_queue):
+        """Transcribe data from audio sources. In this case we have 2 sources: microphone and speaker.
+        Args:
+          audio_queue: queue object with references to audio files
+        """
         while True:
             who_spoke, data, time_spoken = audio_queue.get()
             self.update_last_sample_and_phrase_status(who_spoke, data, time_spoken)
@@ -50,7 +56,8 @@ def transcribe_audio_queue(self, audio_queue):
                     fd, path = tempfile.mkstemp(suffix=".wav")
                     os.close(fd)
                     source_info["process_data_func"](source_info["last_sample"], path)
-                    text = self.audio_model.get_transcription(path)
+                    if self.transcribe:
+                        text = self.audio_model.get_transcription(path)
                 except Exception as exception:
                     print(exception)
                 finally:
@@ -61,6 +68,8 @@ def transcribe_audio_queue(self, audio_queue):
                 self.transcript_changed_event.set()

     def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken):
+        if not self.transcribe:
+            return
         source_info = self.audio_sources[who_spoke]
         if source_info["last_spoken"] and time_spoken - source_info["last_spoken"] > timedelta(seconds=PHRASE_TIMEOUT):
             source_info["last_sample"] = bytes()
@@ -69,15 +78,19 @@ def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken):
             source_info["new_phrase"] = False

         source_info["last_sample"] += data
-        source_info["last_spoken"] = time_spoken
+        source_info["last_spoken"] = time_spoken

     def process_mic_data(self, data, temp_file_name):
+        if not self.transcribe:
+            return
         audio_data = sr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"])
         wav_data = io.BytesIO(audio_data.get_wav_data())
         with open(temp_file_name, 'w+b') as f:
             f.write(wav_data.read())

     def process_speaker_data(self, data, temp_file_name):
+        if not self.transcribe:
+            return
         with wave.open(temp_file_name, 'wb') as wf:
             wf.setnchannels(self.audio_sources["Speaker"]["channels"])
             p = pyaudio.PyAudio()
@@ -86,6 +99,12 @@ def process_speaker_data(self, data, temp_file_name):
             wf.writeframes(data)

     def update_transcript(self, who_spoke, text, time_spoken):
+        """Update the transcript with new data
+        Args:
+          who_spoke: Person this audio is attributed to
+          text: Actual spoken words
+          time_spoken: Time at which the audio was captured, relative to start time
+        """
         source_info = self.audio_sources[who_spoke]
         transcript = self.transcript_data[who_spoke]

@@ -96,13 +115,22 @@ def update_transcript(self, who_spoke, text, time_spoken):
             transcript.append((f"{who_spoke}: [{text}]\n\n", time_spoken))

     def get_transcript(self, length: int = 0):
+        """Get the audio transcript
+        Args:
+          length: Return only the last `length` elements of the transcript.
+                  The default value of 0 returns the complete transcript.
+        """
         combined_transcript = list(merge(
             self.transcript_data["You"], self.transcript_data["Speaker"],
             key=lambda x: x[1], reverse=False))
         combined_transcript = combined_transcript[-length:]
         return "".join([t[0] for t in combined_transcript])

     def clear_transcript_data(self):
+        """Clear all data stored internally for the audio transcript
+        """
         self.transcript_data["You"].clear()
         self.transcript_data["Speaker"].clear()
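One caveat with the new gating in transcribe_audio_queue: when self.transcribe is False, text is never assigned, so if the code elided below the hunk reads text, that iteration raises UnboundLocalError (swallowed by the broad except). A defensive variant pre-binds the variable; a sketch of the same block under that assumption, with the elided finally body guessed as temp-file cleanup:

```
                text = ''   # pre-bind so 'text' exists even when transcription is paused
                path = None
                try:
                    fd, path = tempfile.mkstemp(suffix=".wav")
                    os.close(fd)
                    source_info["process_data_func"](source_info["last_sample"], path)
                    if self.transcribe:
                        text = self.audio_model.get_transcription(path)
                except Exception as exception:
                    print(exception)
                finally:
                    if path:
                        os.unlink(path)  # assumed cleanup; the original 'finally' body is elided
```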
6 changes: 3 additions & 3 deletions GPTResponder.py
@@ -39,10 +39,10 @@ def respond_to_transcriber(self, transcriber):
                 transcriber.transcript_changed_event.clear()
                 transcript_string = transcriber.get_transcript(length=MAX_PHRASES)
                 response = generate_response_from_transcript(transcript_string)

                 end_time = time.time()  # Measure end time
-                execution_time = end_time - start_time  # Calculate the time it took to execute the function
+                execution_time = end_time - start_time  # Calculate time to execute the function

                 if response != '':
                     self.response = response
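Only the timing comment changes in this hunk, but for orientation it helps to see the loop it sits in: respond_to_transcriber is the consumer of the transcript_changed_event that AudioTranscriber sets. A rough reconstruction under stated assumptions; the wait() call and the overall shape of the loop are inferred from the visible fragment, not shown in the diff:

```
import time

def respond_to_transcriber(self, transcriber):
    while True:
        # Assumed: block until the transcriber signals fresh transcript text.
        transcriber.transcript_changed_event.wait()
        start_time = time.time()  # Measure start time
        transcriber.transcript_changed_event.clear()
        transcript_string = transcriber.get_transcript(length=MAX_PHRASES)
        response = generate_response_from_transcript(transcript_string)
        end_time = time.time()  # Measure end time
        execution_time = end_time - start_time  # time spent generating the response
        if response != '':
            self.response = response
```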
7 changes: 4 additions & 3 deletions README.md
@@ -87,7 +87,7 @@ While Transcribe provides real-time transcription and optional response suggesti…

 **Whisper Model**: If the --api flag is not used, we utilize the 'tiny' version of the Whisper ASR model, due to its low resource consumption and fast response times. However, this model may not be as accurate as the larger models in transcribing certain types of speech, including accents or uncommon words.

-**OpenAI Account**: If a paid OpenAI account with a valid OpenAI API key is not used, the command window displays the following error message repeatedly, though the application behavior is not impacted in any way
+**OpenAI Account**: If a paid OpenAI account with a valid OpenAI API key is not used, the command window displays the following error message repeatedly, though the application behavior is not impacted in any way.
 ```
 Incorrect API key provided: API_KEY. You can find your API key at https://platform.openai.com/account/api-keys.
 ```
@@ -102,8 +102,9 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file…

 ## ➕ Enhancements from base repository ➕
 - Do not need an OpenAI key or paid OpenAI account to use the complete functionality
-- Allow contextual information to provide customized responses to users
-- Transcribe any video
+- Allow users to add contextual information to provide customized responses to conversation
+- Allow pausing audio transcription
+- Transcribe the audio of any video
 - Preserve all conversation text in UI
 - Allow saving conversation to file
47 changes: 30 additions & 17 deletions main.py
@@ -3,6 +3,7 @@
 from argparse import RawTextHelpFormatter
 from AudioTranscriber import AudioTranscriber
 from GPTResponder import GPTResponder
+import prompts
 import customtkinter as ctk
 import AudioRecorder
 import queue
@@ -25,7 +26,7 @@ def update_transcript_UI(transcriber, textbox):
     textbox.after(300, update_transcript_UI, transcriber, textbox)


-def update_response_UI(responder, textbox, update_interval_slider_label,
+def update_response_UI(responder, textbox, update_interval_slider_label,
                        update_interval_slider, freeze_state):
     if not freeze_state[0]:
         response = responder.response
@@ -66,8 +67,9 @@ def create_ui_components(root):
     response_textbox = ctk.CTkTextbox(root, width=300, font=("Arial", font_size),
                                       text_color='#639cdc', wrap="word")
     response_textbox.grid(row=0, column=1, padx=10, pady=20, sticky="nsew")
+    response_textbox.insert("0.0", prompts.INITIAL_RESPONSE)

-    freeze_button = ctk.CTkButton(root, text="Freeze", command=None)
+    freeze_button = ctk.CTkButton(root, text="Suggest Response", command=None)
     freeze_button.grid(row=1, column=1, padx=10, pady=3, sticky="nsew")

     update_interval_slider_label = ctk.CTkLabel(root, text="", font=("Arial", 12),
@@ -82,14 +84,17 @@ def create_ui_components(root):
     copy_button = ctk.CTkButton(root, text="Copy Audio Transcript", command=None)
     copy_button.grid(row=2, column=0, padx=10, pady=3, sticky="nsew")

-    save_file_button = ctk.CTkButton(root, text="Save to File", command=None)
+    save_file_button = ctk.CTkButton(root, text="Save Audio Transcript to File", command=None)
     save_file_button.grid(row=3, column=0, padx=10, pady=3, sticky="nsew")

+    transcript_button = ctk.CTkButton(root, text="Pause Transcript", command=None)
+    transcript_button.grid(row=4, column=0, padx=10, pady=3, sticky="nsew")
+
+    # Order of returned components is important. Add new components at the end.
     return [transcript_textbox, response_textbox, update_interval_slider,
             update_interval_slider_label, freeze_button, copy_button,
-            save_file_button]
+            save_file_button, transcript_button]
@@ -100,7 +105,8 @@ def main():
     cmd_args.add_argument('-a', '--api', action='store_true',
                           help='Use the online Open AI API for transcription.\
                           \nThis option requires an API KEY and will consume Open AI credits.')
-    cmd_args.add_argument('-m', '--model', action='store', choices=['tiny', 'base', 'small'], default='tiny',
+    cmd_args.add_argument('-m', '--model', action='store', choices=['tiny', 'base', 'small'],
+                          default='tiny',
                           help='Specify the model to use for transcription.'
                           '\nBy default tiny model is part of the install.'
                           '\nbase model has to be downloaded from the link https://drive.google.com/file/d/1E44DVjpfZX8tSrSagaDJXU91caZOkwa6/view?usp=drive_link'
@@ -127,6 +133,7 @@ def main():
     freeze_button = ui_components[4]
     copy_button = ui_components[5]
     save_file_button = ui_components[6]
+    transcript_button = ui_components[7]

     audio_queue = queue.Queue()
@@ -142,16 +149,16 @@ def main():
     # Transcribe and Respond threads, both work on the same instance of the AudioTranscriber class
     transcriber = AudioTranscriber(user_audio_recorder.source,
                                    speaker_audio_recorder.source, model)
-    transcribe = threading.Thread(target=transcriber.transcribe_audio_queue,
-                                  args=(audio_queue,))
-    transcribe.daemon = True
-    transcribe.start()
+    transcribe_thread = threading.Thread(target=transcriber.transcribe_audio_queue,
+                                         args=(audio_queue,))
+    transcribe_thread.daemon = True
+    transcribe_thread.start()

     responder = GPTResponder()
-    respond = threading.Thread(target=responder.respond_to_transcriber,
-                               args=(transcriber,))
-    respond.daemon = True
-    respond.start()
+    respond_thread = threading.Thread(target=responder.respond_to_transcriber,
+                                      args=(transcriber,))
+    respond_thread.daemon = True
+    respond_thread.start()

     print("READY")
@@ -163,15 +170,15 @@ def main():
     root.grid_columnconfigure(1, weight=1)

     # Add the clear transcript button to the UI
-    clear_transcript_button = ctk.CTkButton(root, text="Clear Transcript",
+    clear_transcript_button = ctk.CTkButton(root, text="Clear Audio Transcript",
                                             command=lambda: clear_context(transcriber, audio_queue))
     clear_transcript_button.grid(row=1, column=0, padx=10, pady=3, sticky="nsew")

-    freeze_state = [False]  # Using a list so inner functions can change its content
+    freeze_state = [True]  # Using a list so inner functions can change its content

     def freeze_unfreeze():
-        freeze_state[0] = not freeze_state[0]  # Invert the freeze state
-        freeze_button.configure(text="Unfreeze" if freeze_state[0] else "Freeze")
+        freeze_state[0] = not freeze_state[0]  # Invert the state
+        freeze_button.configure(text="Suggest Response" if freeze_state[0] else "Do Not Suggest Response")

     freeze_button.configure(command=freeze_unfreeze)
@@ -187,6 +194,12 @@ def save_file():

     save_file_button.configure(command=save_file)

+    def set_transcript_state():
+        transcriber.transcribe = not transcriber.transcribe
+        transcript_button.configure(text="Pause Transcript" if transcriber.transcribe else "Start Transcript")
+
+    transcript_button.configure(command=set_transcript_state)
+
     update_interval_slider_label.configure(text=f"Update interval: \
                                            {update_interval_slider.get()} \
                                            seconds")
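A side note on the freeze_state idiom used above: a single-element list is the classic workaround for rebinding a flag from inside a nested callback, since assigning to a plain boolean in freeze_unfreeze would only create a local variable. A self-contained illustration with hypothetical names:

```
def make_toggle(initial=True):
    state = [initial]  # one-element list acts as a mutable cell shared with the closure

    def toggle():
        state[0] = not state[0]  # mutate the cell; no rebinding, so no 'nonlocal' needed
        return state[0]

    return toggle

toggle = make_toggle()
print(toggle(), toggle())  # False True
```

On Python 3, declaring the variable nonlocal inside the callback is the more direct spelling; the list form works everywhere and is common in Tkinter callback code.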