Continuous mode not working #148

Closed · wants to merge 40 commits

Changes from 1 commit

Commits (40)
9fde685
Update README.md for Transcribe
vivekuppal Jun 29, 2023
1483fea
Merge pull request #1 from vivekuppal/vu-readme-updates
vivekuppal Jun 29, 2023
391d728
Allow usage without a valid OPEN API key. (#2)
vivekuppal Jun 29, 2023
ab4245d
Update README.md (#3)
vivekuppal Jun 29, 2023
ebb6f2f
Allow user to choose model. Add arguments to main file.
vivekuppal Jun 29, 2023
8f5a595
Code clean up, add linting. (#4)
vivekuppal Jun 30, 2023
59d5c91
UI Text Chronology (#5)
vivekuppal Jun 30, 2023
f772bb8
Update readme with Enhancements. Allow copy of text from UI window. R…
vivekuppal Jun 30, 2023
87a38b1
Save conversation to text. (#9)
vivekuppal Jun 30, 2023
65d6dcf
Add Contextual Information to Responses (#11)
vivekuppal Jun 30, 2023
d1b3c45
Allow users to pause audio transcription. Change the default for gett…
vivekuppal Jul 3, 2023
cfca51a
Update main.py (#15)
abhinavuppal1 Jul 11, 2023
152bad3
Code reorg to separate UI code (#16)
vivekuppal Jul 12, 2023
addf17f
Add support for multiple languages (#18)
vivekuppal Jul 12, 2023
e5cda88
Easy install for non developers on windows (#20)
vivekuppal Jul 18, 2023
9896c1c
Disabled winrar UI (#22)
Adarsha-gg Jul 18, 2023
901501b
When using API, we do not need to specify language, absorb the lang p…
vivekuppal Jul 18, 2023
bd48b61
Language combo fix (#26)
Adarsha-gg Jul 19, 2023
7c9ca88
Added gdrive (#27)
Adarsha-gg Jul 19, 2023
2429c97
Allow usage of API Key in installed version of Transcribe (#28)
vivekuppal Jul 19, 2023
12ef846
updated the drive link (#30)
Adarsha-gg Jul 20, 2023
4be26c7
Add a duration class to easily measure the time taken for an operatio…
vivekuppal Jul 21, 2023
6e53b31
--api option was not working correctly (#34)
vivekuppal Jul 21, 2023
bd42b8c
Initial unit tests for the speech recognition library (#36)
vivekuppal Jul 24, 2023
af87eff
user reported defect fixes. (#39)
vivekuppal Jul 26, 2023
26cfaad
Optimize LLM usage (#40)
vivekuppal Jul 26, 2023
f8d5857
Bug fixes for exceptions observed during usage. Add further plumbing …
vivekuppal Jul 27, 2023
1356a78
Add logging infrastructure (#42)
vivekuppal Jul 27, 2023
a1cc48b
Get Response from LLM on demand (#44)
vivekuppal Jul 28, 2023
ea5f392
Models from open ai site (#43)
Adarsha-gg Jul 28, 2023
b4e03a4
List all active devices (#45)
vivekuppal Aug 1, 2023
85d09ed
Allow user to select input, output audio devices (#48)
vivekuppal Aug 21, 2023
28d1e9a
Disable mic speaker selectively (#49)
vivekuppal Aug 23, 2023
e48bdb8
Add Audio Response for LLM generated content (#50)
vivekuppal Aug 27, 2023
6baa77f
Update, upload latest binaries (#54)
Adarsha-gg Aug 30, 2023
fa55416
Multiturn prompts, bug fixes (#55)
vivekuppal Sep 5, 2023
ce5a1e1
Allow enable/disable speaker and microphone from UI (#56)
Adarsha-gg Sep 6, 2023
e445856
Update gdrive link (#58)
Adarsha-gg Sep 7, 2023
b50f58c
Bring readme up to date with current functionality. Describe content …
vivekuppal Sep 8, 2023
a7ea2cc
Continuous mode broke after updates to the UI.
vivekuppal Sep 8, 2023
Add Audio Response for LLM generated content (#50)
* Add text to speech. Parse OpenAI responses better than before
vivekuppal committed Aug 27, 2023
commit e48bdb813414078f615355d304789cb8dccda1f8
4 changes: 1 addition & 3 deletions AudioRecorder.py
@@ -174,9 +174,7 @@ def set_device(self, index: int):
)
self.source = source
print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ')
# This line is commented because in case of non default microphone it can occasionally take
# several minutes to execute, thus delaying the start of the application.
# self.adjust_for_noise("Default Mic", "Please make some noise from the Default Mic...")
self.adjust_for_noise("Mic", "Please make some noise from the chosen Mic...")


class SpeakerRecorder(BaseRecorder):
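For context on the `adjust_for_noise` change above: ambient-noise calibration of the kind the `speech_recognition` library performs samples the source for a fixed window before transcription starts. A minimal sketch of that idea, using `speech_recognition` directly (the `device_index` and `duration` values are illustrative assumptions, not taken from this repository):

```python
# Hedged sketch: ambient-noise calibration with a bounded sampling window.
# device_index=1 and duration=1.0 are illustrative values only.
import speech_recognition as sr

recognizer = sr.Recognizer()
microphone = sr.Microphone(device_index=1)  # hypothetical non-default mic

with microphone as source:
    # Sampling is bounded by `duration`, so the startup cost stays predictable;
    # the removed comment above described unbounded delays on some devices.
    recognizer.adjust_for_ambient_noise(source, duration=1.0)
print('Calibration complete; energy threshold =', recognizer.energy_threshold)
```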
19 changes: 0 additions & 19 deletions CustomPrompts.py

This file was deleted.

59 changes: 43 additions & 16 deletions GPTResponder.py
@@ -1,8 +1,9 @@
import datetime
import time
import pprint
import openai
import GlobalVars
import prompts
import openai
import conversation
import constants
import configuration
@@ -30,14 +31,12 @@ def generate_response_from_transcript_no_check(self, transcript) -> str:
Gets a response even if the continuous suggestion option is disabled.
"""
try:
# prompt_content = create_prompt(transcript)
# prompt_api_message = [{"role": "system", "content": prompt_content}]
prompt_api_message = prompts.create_single_turn_prompt_message(transcript)
multiturn_prompt_content = self.conversation.get_merged_conversation(
length=constants.MAX_TRANSCRIPTION_PHRASES_FOR_LLM)
multiturn_prompt_api_message = prompts.create_multiturn_prompt(multiturn_prompt_content)
# print(f'Usual prompt api message: {prompt_api_message}')
# print(f'Multiturn prompt: {multiturn_prompt_api_message}')
# pprint.pprint(f'Prompt api message: {prompt_api_message}')
# print(f'Multiturn prompt for ChatGPT: {multiturn_prompt_api_message}')
usual_response = openai.ChatCompletion.create(
model=self.model,
messages=prompt_api_message,
@@ -51,13 +50,32 @@ def generate_response_from_transcript_no_check(self, transcript) -> str:
return prompts.INITIAL_RESPONSE

usual_full_response = usual_response.choices[0].message.content
# pprint.pprint(f'Prompt api response: {usual_response}')
try:
return usual_full_response.split('[')[1].split(']')[0]
# The original way of processing the response. It used to cause issues when there
# were multiple questions in the transcript.
# response = usual_full_response.split('[')[1].split(']')[0]
processed_response = self.process_response(usual_full_response)
self.update_conversation(persona=constants.PERSONA_ASSISTANT, response=processed_response)
return processed_response
except Exception as exception:
root_logger.error('Error parsing response from LLM.')
root_logger.exception(exception)
return prompts.INITIAL_RESPONSE

def process_response(self, input_str: str) -> str:
lines = input_str.split(sep='\n')
response = ''
for line in lines:
# Skip any responses that contain content like
# Speaker 1: <Some statement>
# This is generated content added by OpenAI that can be skipped
if 'Speaker' in line and ':' in line:
continue
response = response + line.strip().strip('[').strip(']')

return response

def generate_response_from_transcript(self, transcript):
"""Ping OpenAI LLM model to get response from the Assistant
"""
@@ -67,6 +85,13 @@ def generate_response_from_transcript(self, transcript):

return self.generate_response_from_transcript_no_check(transcript)

def update_conversation(self, response, persona):
if response != '':
self.response = response
self.conversation.update_conversation(persona=persona,
text=response,
time_spoken=datetime.datetime.now())

def respond_to_transcriber(self, transcriber):
"""Thread method to continously update the transcript
"""
@@ -76,27 +101,29 @@ def respond_to_transcriber(self, transcriber):
start_time = time.time()

transcriber.transcript_changed_event.clear()
transcript_string = transcriber.get_transcript(
length=constants.MAX_TRANSCRIPTION_PHRASES_FOR_LLM)
response = self.generate_response_from_transcript(transcript_string)
response = ''

# Do processing only if LLM transcription is enabled
if not self.gl_vars.freeze_state[0]:
transcript_string = transcriber.get_transcript(
length=constants.MAX_TRANSCRIPTION_PHRASES_FOR_LLM)
response = self.generate_response_from_transcript(transcript_string)

end_time = time.time() # Measure end time
execution_time = end_time - start_time # Calculate time to execute the function

if response != '':
self.response = response
self.conversation.update_conversation(persona=constants.PERSONA_ASSISTANT,
text=response,
time_spoken=datetime.datetime.now())

remaining_time = self.response_interval - execution_time
if remaining_time > 0:
time.sleep(remaining_time)
else:
time.sleep(0.3)
time.sleep(self.response_interval)

def update_response_interval(self, interval):
"""Change the interval for pinging LLM
"""
root_logger.info(GPTResponder.update_response_interval.__name__)
self.response_interval = interval


if __name__ == "__main__":
print('GPTResponder')
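The key fix in this file is `process_response`, which replaces the old `split('[')[1].split(']')[0]` parsing that broke whenever the model returned multiple bracketed segments. It drops fabricated `Speaker N: ...` continuation lines and strips the square brackets the prompt asks the model to wrap its answer in. A standalone illustration of that behavior (the sample LLM output is invented):

```python
# Standalone sketch of the filtering that process_response performs.
# The example response string below is invented for illustration.
def process_response(input_str: str) -> str:
    response = ''
    for line in input_str.split(sep='\n'):
        # Skip lines like "Speaker 1: <statement>" that the model sometimes
        # fabricates as imagined continuations of the conversation.
        if 'Speaker' in line and ':' in line:
            continue
        response = response + line.strip().strip('[').strip(']')
    return response

sample = '[Sure, the meeting is at 3 PM.]\nSpeaker 1: Great, see you then.'
print(process_response(sample))  # -> Sure, the meeting is at 3 PM.
```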
2 changes: 2 additions & 0 deletions GlobalVars.py
@@ -2,6 +2,7 @@
import tkinter as tk
import customtkinter as ctk
from AudioTranscriber import AudioTranscriber
from audio_player import AudioPlayer
import AudioRecorder
import Singleton
import app_logging as al
@@ -17,6 +18,7 @@ class TranscriptionGlobals(Singleton.Singleton):
audio_queue: queue.Queue = None
user_audio_recorder: AudioRecorder.MicRecorder = None
speaker_audio_recorder: AudioRecorder.SpeakerRecorder = None
audio_player: AudioPlayer = None
# Global for transcription from speaker, microphone
transcriber: AudioTranscriber = None
# Global for responses from openAI API
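The `audio_player` field added here follows the file's existing pattern: `TranscriptionGlobals` is a Singleton, so the recorders, the transcriber, and now the audio player are created once and shared across the UI and worker threads. A minimal sketch of that pattern (abbreviated; not the project's exact `Singleton` implementation):

```python
# Minimal sketch of a Singleton-backed globals registry; names abbreviated.
class Singleton:
    _instance = None

    def __new__(cls, *args, **kwargs):
        # Create the single instance lazily on first construction.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance


class TranscriptionGlobals(Singleton):
    audio_player = None  # assigned once at startup, read everywhere else


first = TranscriptionGlobals()
second = TranscriptionGlobals()
assert first is second  # every module sees the same globals object
```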
1 change: 1 addition & 0 deletions README.md
@@ -142,6 +142,7 @@ Incorrect API key provided: API_KEY. You can find your API key at https://platfo
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## ➕ Enhancements from base repository ➕
- Speech Mode - Read out responses from ChatGPT as Audio
- Do not need Open AI key, paid Open AI account to use the complete functionality
- Allow users selective disabling of mic, speaker audio input
- Allow users to add contextual information to provide customized responses to conversation
2 changes: 1 addition & 1 deletion TranscriberModels.py
@@ -73,7 +73,7 @@ def load_model(self):

class APIWhisperTranscriber:
def __init__(self):
print('Using Open AI API for transcription.')
print('[INFO] Using Open AI API for transcription.')
openai.api_key = GlobalVars.TranscriptionGlobals().api_key
# lang parameter is not required for API invocation. This exists solely
# to support --api option from command line.
64 changes: 64 additions & 0 deletions audio_player.py
@@ -0,0 +1,64 @@
"""Plays the responses received from LLM as Audio
This class does text to speech
"""

import os
import time
import tempfile
import threading
import playsound
import gtts
import app_logging as al
import conversation
import constants


root_logger = al.get_logger()


class AudioPlayer:
"""Play text to audio
"""
def __init__(self, convo: conversation):
root_logger.info(AudioPlayer.__name__)
self.speech_text_available = threading.Event()
self.conversation = convo
self.temp_dir = tempfile.gettempdir()

def play_audio(self, speech: str):
"""Play text to audio.
This is a blocking method and will return when audio playback is complete.
For large audio text, this could be several minutes.
"""
root_logger.info(AudioPlayer.__name__)
audio_obj = gtts.gTTS(speech)
temp_audio_file = tempfile.mkstemp(dir=self.temp_dir, suffix='.mp3')
os.close(temp_audio_file[0])

audio_obj.save(temp_audio_file[1])
try:
playsound.playsound(temp_audio_file[1])
except playsound.PlaysoundException as play_ex:
print('Error when attempting to play audio.')
print(play_ex)

os.remove(temp_audio_file[1])

def play_audio_loop(self):
"""Play text to audio continuously based on the signaling of event
"""
while True:
if self.speech_text_available.is_set():
self.speech_text_available.clear()
speech = self.conversation.get_conversation(
sources=[constants.PERSONA_ASSISTANT], length=1)
# Speech text is in the format
# f"{persona}: [{text}]\n\n"
# Remove persona
final_speech = speech[len(constants.PERSONA_ASSISTANT)+2:]
# Remove whitespace
final_speech = final_speech.strip()
# Remove Square brackets
final_speech = final_speech[1:-1]
self.play_audio(speech=final_speech)
time.sleep(0.1)
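`play_audio_loop` is built to run on a background thread and wake when `speech_text_available` is set after a new assistant response lands in the conversation. A sketch of the intended wiring (the `StubConversation` here is a hypothetical stand-in for the real `Conversation` object):

```python
# Hedged sketch of driving the player loop; StubConversation is invented
# to stand in for conversation.Conversation.
import threading
from audio_player import AudioPlayer


class StubConversation:
    def get_conversation(self, sources=None, length=1):
        # Mimics the "persona: [text]\n\n" format the loop expects.
        return 'assistant: [Hello there]\n\n'


player = AudioPlayer(convo=StubConversation())
threading.Thread(target=player.play_audio_loop, daemon=True).start()

# Later, whenever a fresh LLM response is stored:
player.speech_text_available.set()  # loop strips persona and brackets, then speaks
```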
4 changes: 2 additions & 2 deletions constants.py
@@ -2,8 +2,8 @@
"""

PERSONA_YOU = 'You'
PERSONA_ASSISTANT = 'Assistant'
PERSONA_SYSTEM = 'System'
PERSONA_ASSISTANT = 'assistant'
PERSONA_SYSTEM = 'system'
PERSONA_SPEAKER = 'Speaker'

LOG_NAME = 'Transcribe'
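Lowercasing `PERSONA_ASSISTANT` and `PERSONA_SYSTEM` likely lines these constants up with the role strings the OpenAI chat completions API expects, which matters now that conversation entries are fed back as chat messages. For reference, a chat payload uses exactly these lowercase roles (the contents are invented):

```python
# The lowercase role strings the OpenAI chat API expects; content invented.
messages = [
    {'role': 'system', 'content': 'You are a casual pal...'},
    {'role': 'assistant', 'content': '[Sure, happy to help.]'},
]
```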
32 changes: 18 additions & 14 deletions conversation.py
@@ -1,14 +1,8 @@
from heapq import merge
import datetime
# import pprint
import constants
import configuration
import datetime

DEFAULT_PREAMBLE = """You are a casual pal, genuinely interested in the conversation at hand.""" \
"""Please respond, in detail, to the conversation. Confidently give a """\
"""straightforward response to the speaker, even if you don't understand """\
"""them. Give your response in square brackets. DO NOT ask to repeat, """\
"""and DO NOT ask for clarification. Just answer the speaker directly."""\
"""A poor transcription of conversation is given below."""


class Conversation:
@@ -21,9 +15,17 @@ def __init__(self):
constants.PERSONA_YOU: [],
constants.PERSONA_SPEAKER: [],
constants.PERSONA_ASSISTANT: []}
transcript = self.transcript_data[constants.PERSONA_SYSTEM]
transcript.append((f"{constants.PERSONA_SYSTEM}: [{DEFAULT_PREAMBLE}]\n\n", datetime.datetime.now()))
config = configuration.Config().get_data()
prompt = config["OpenAI"]["system_prompt"]
self.update_conversation(persona=constants.PERSONA_SYSTEM, text=prompt,
time_spoken=datetime.datetime.now())
initial_convo: dict = config["OpenAI"]["initial_convo"]
# Read the initial conversation from parameters.yaml file and add to the convo
for _, value in initial_convo.items():
role = value['role']
content = value['content']
self.update_conversation(persona=role, text=content,
time_spoken=datetime.datetime.now())

def clear_conversation_data(self):
"""Clear all conversation data
@@ -47,12 +49,12 @@ def update_conversation(self, persona: str, text: str, time_spoken, pop: bool =

def get_conversation(self,
sources: list = None,
length: int = 0):
"""Get the complete transcript
length: int = 0) -> list:
"""Get the transcript based on specified sources
Args:
sources: Get data from which sources (You, Speaker, Assistant, System)
length: Get the last length elements from the audio transcript.
Default value = 0, gives the complete transcript
Default value = 0, gives the complete transcript for chosen sources
"""
if sources is None:
sources = [constants.PERSONA_YOU,
@@ -72,19 +74,21 @@ def get_merged_conversation(self, length: int = 0) -> list:
def get_merged_conversation(self, length: int = 0) -> list:
"""Creates a prompt to be sent to LLM (OpenAI by default)
length: Get the last length elements from the audio transcript.
Initial system prompt is always part of the return value
Default value = 0, gives the complete transcript
"""
# print(f'You: Length: {len(self.transcript_data[constants.PERSONA_YOU])}')
# print(f'Speaker: Length: {len(self.transcript_data[constants.PERSONA_SPEAKER])}')
# print(f'Assistant: Length: {len(self.transcript_data[constants.PERSONA_ASSISTANT])}')
# print(f'System: Length: {len(self.transcript_data[constants.PERSONA_SYSTEM])}')

combined_transcript = self.transcript_data[constants.PERSONA_SYSTEM]
combined_transcript = list(merge(
combined_transcript,
self.transcript_data[constants.PERSONA_YOU][-length:],
self.transcript_data[constants.PERSONA_SPEAKER][-length:],
self.transcript_data[constants.PERSONA_ASSISTANT][-length:],
key=lambda x: x[1]))
combined_transcript = combined_transcript[-length:]

combined_transcript.insert(0, (f"{constants.PERSONA_SYSTEM}: [{self.transcript_data[constants.PERSONA_SYSTEM][0]}]\n\n", datetime.datetime.now()))
return combined_transcript
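`get_merged_conversation` now interleaves the per-persona `(text, time_spoken)` lists by timestamp with `heapq.merge` (each list is already chronological), trims to the last `length` entries, and re-inserts the system prompt at the front. A toy illustration of the merge step (the tuples are invented):

```python
# Toy illustration of merging per-persona (text, time) lists by timestamp;
# the tuples below are invented for the example.
import datetime
from heapq import merge

t = datetime.datetime
you = [('You: [hi]', t(2023, 8, 27, 10, 0, 0))]
speaker = [('Speaker: [hello]', t(2023, 8, 27, 10, 0, 5))]
assistant = [('assistant: [Hey!]', t(2023, 8, 27, 10, 0, 2))]

combined = list(merge(you, speaker, assistant, key=lambda x: x[1]))
print([text for text, _ in combined])
# -> ['You: [hi]', 'assistant: [Hey!]', 'Speaker: [hello]']
```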