forked from DougDougGithub/ChatGodApp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
azure_text_to_speech.py
117 lines (102 loc) · 4.33 KB
/
azure_text_to_speech.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
import random
from xml.sax.saxutils import escape

import azure.cognitiveservices.speech as speechsdk
import pygame
from gtts import gTTS
from pydub import AudioSegment
# Azure neural voice names; one of these is picked when text_to_audio is
# called with voice_name="random".
AZURE_VOICES = [
    "en-US-DavisNeural",
    "en-US-TonyNeural",
    "en-US-JasonNeural",
    "en-US-GuyNeural",
    "en-US-JaneNeural",
    "en-US-NancyNeural",
    "en-US-JennyNeural",
    "en-US-AriaNeural",
]
# Speaking styles a message can be rendered in. Nine of the eleven available
# voice styles are listed here; note that not every voice supports every style.
AZURE_VOICE_STYLES = [
    "angry",
    "cheerful",
    "excited",
    "hopeful",
    "sad",
    "shouting",
    "terrified",
    "unfriendly",
    "whispering",
]
# Maps a "(style)" prefix at the start of a (lowercased) message to the voice
# style it selects.
AZURE_PREFIXES = {
    "(angry)": "angry",
    "(cheerful)": "cheerful",
    "(excited)": "excited",
    "(hopeful)": "hopeful",
    "(sad)": "sad",
    # "(shout)" is a short alias for "(shouting)"
    "(shouting)": "shouting",
    "(shout)": "shouting",
    "(terrified)": "terrified",
    "(unfriendly)": "unfriendly",
    # "(whisper)" is a short alias for "(whispering)"
    "(whispering)": "whispering",
    "(whisper)": "whispering",
    # "random" is a sentinel value rather than a real style name
    "(random)": "random",
}
class AzureTTSManager:
    """Render text to .wav files with Azure neural voices, falling back to gTTS.

    The Azure subscription key and region are read from the AZURE_TTS_KEY and
    AZURE_TTS_REGION environment variables when an instance is created.
    """

    azure_speechconfig = None
    azure_synthesizer = None

    def __init__(self):
        """Initialise pygame and create the Azure speech config and synthesizer."""
        pygame.init()
        # Creates an instance of a speech config with specified subscription key and service region.
        # The key and region come from the environment (region is e.g. "westus").
        self.azure_speechconfig = speechsdk.SpeechConfig(
            subscription=os.getenv('AZURE_TTS_KEY'),
            region=os.getenv('AZURE_TTS_REGION'),
        )
        # Set the voice name, refer to https://aka.ms/speech/voices/neural for full list.
        self.azure_speechconfig.speech_synthesis_voice_name = "en-US-AriaNeural"
        # Creates a speech synthesizer. Setting audio_config to None means it wont play the synthesized text out loud.
        self.azure_synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=self.azure_speechconfig,
            audio_config=None,
        )

    @staticmethod
    def _split_style_prefix(text):
        """Return (style, rest) for a leading "(style)" prefix, or (None, text) if there is none."""
        if text.startswith("("):
            close_index = text.find(")")
            if close_index != -1:
                style = AZURE_PREFIXES.get(text[:close_index + 1])
                if style is not None:
                    return style, text[close_index + 1:]
        return None, text

    @staticmethod
    def _build_ssml(voice_name, voice_style, text):
        """Return the SSML document that speaks *text* with the given voice and style."""
        # Everything interpolated into the document is XML-escaped: a raw "&"
        # or "<" in a message would make the SSML malformed, and markup in the
        # message would otherwise be injected into the document.
        attribute_entities = {"'": "&apos;", '"': "&quot;"}
        safe_voice = escape(voice_name, attribute_entities)
        safe_style = escape(voice_style, attribute_entities)
        safe_text = escape(text)
        return (
            "<speak version='1.0' "
            # The W3C SSML and EmotionML namespace URIs use the http scheme.
            "xmlns='http://www.w3.org/2001/10/synthesis' "
            "xmlns:mstts='https://www.w3.org/2001/mstts' "
            "xmlns:emo='http://www.w3.org/2009/10/emotionml' "
            "xml:lang='en-US'>"
            f"<voice name='{safe_voice}'>"
            f"<mstts:express-as style='{safe_style}'>{safe_text}</mstts:express-as>"
            "</voice></speak>"
        )

    def text_to_audio(self, text: str, voice_name="random", voice_style="random"):
        """Synthesize *text* into a .wav file in the current working directory.

        The text is lowercased before use. If it starts with one of the
        AZURE_PREFIXES keys (e.g. "(angry)"), that prefix is removed and
        selects the voice style, overriding *voice_style*.

        Args:
            text: The message to speak.
            voice_name: An Azure voice name, or "random" to pick one from
                AZURE_VOICES.
            voice_style: A voice style, or "random" to pick one from
                AZURE_VOICE_STYLES.

        Returns:
            The path to the new .wav file, or None when there is nothing to
            say once the prefix and surrounding whitespace are removed.
        """
        if voice_name == "random":
            voice_name = random.choice(AZURE_VOICES)

        # Change the voice style if the message includes a prefix
        text = text.lower()
        prefix_style, text = self._split_style_prefix(text)
        if prefix_style is not None:
            voice_style = prefix_style

        # A whitespace-only message has nothing to synthesize either.
        text = text.strip()
        if not text:
            print("This message was empty")
            return None

        # Resolved after the prefix check because "(random)" also maps to "random".
        if voice_style == "random":
            voice_style = random.choice(AZURE_VOICE_STYLES)

        ssml_text = self._build_ssml(voice_name, voice_style, text)
        result = self.azure_synthesizer.speak_ssml_async(ssml_text).get()

        output = os.path.join(
            os.path.abspath(os.curdir),
            f"_Msg{hash(text)}{hash(voice_name)}{hash(voice_style)}.wav",
        )
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            stream = speechsdk.AudioDataStream(result)
            stream.save_to_wav_file(output)
            return output

        # If Azure fails, use gTTS instead. gTTS saves as an mp3 by default, so convert it to a wav file after
        print("\n Azure failed, using gTTS instead \n")
        if result.reason == speechsdk.ResultReason.Canceled:
            details = result.cancellation_details
            print(f"Azure cancellation: {details.reason} {details.error_details}")

        # splitext only swaps the final extension; str.replace would also
        # rewrite a ".wav" that happens to appear in a directory name.
        output_mp3 = os.path.splitext(output)[0] + ".mp3"
        try:
            # gTTS gets the unescaped text; escaping is only for the SSML document.
            gTTS(text=text, lang='en', slow=False).save(output_mp3)
            AudioSegment.from_mp3(output_mp3).export(output, format="wav")
        finally:
            # The mp3 is only an intermediate file; don't leave it behind.
            if os.path.exists(output_mp3):
                os.remove(output_mp3)
        return output
# Tests here
# Manual smoke test: speaks a fixed line, then speaks whatever is typed at the
# prompt until the input stream ends or the user presses Ctrl-C.
if __name__ == '__main__':
    tts_manager = AzureTTSManager()
    pygame.mixer.init()

    file_path = tts_manager.text_to_audio("Here's my test audio!!", "en-US-DavisNeural")
    # text_to_audio returns None when there is nothing to speak.
    if file_path is not None:
        pygame.mixer.music.load(file_path)
        pygame.mixer.music.play()

    while True:
        try:
            stuff_to_say = input("\nNext question? \n\n")
        except (EOFError, KeyboardInterrupt):
            # Leave the loop quietly instead of ending in a traceback.
            break
        if not stuff_to_say:
            continue
        file_path = tts_manager.text_to_audio(stuff_to_say)
        if file_path is None:
            # e.g. the input was only a style prefix such as "(angry)"
            continue
        pygame.mixer.music.load(file_path)
        pygame.mixer.music.play()