Merge pull request netease-youdao#90 from john9405/main

feat: Add the ability to adjust voice speed in the 'OpenAI-compatible-TTS API', thanks to @john9405.
wittech · Jan 2, 2024 · 2c0d2e0
2 parents 9df9d9c + 176d8ed
commit 2c0d2e0
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 4 deletions.
diff --git a/openaiapi.py b/openaiapi.py
@@ -13,6 +13,7 @@
 from transformers import AutoTokenizer
 import numpy as np
 import soundfile as sf
+import pyrubberband as pyrb
 from pydub import AudioSegment
 from yacs import config as CONFIG
 from config.joint.config import Config
@@ -165,15 +166,17 @@ def text_to_speech(speechRequest: SpeechRequest):
  np_audio = emotivoice_tts(text, speechRequest.prompt,
  speechRequest.input, speechRequest.voice,
  models)
+ y_stretch = np_audio
+ if speechRequest.speed != 1.0:
+ y_stretch = pyrb.time_stretch(np_audio, config.sampling_rate, speechRequest.speed)
  wav_buffer = io.BytesIO()
- sf.write(file=wav_buffer, data=np_audio,
+ sf.write(file=wav_buffer, data=y_stretch,
  samplerate=config.sampling_rate, format='WAV')
  buffer = wav_buffer
  response_format = speechRequest.response_format
  if response_format != 'wav':
- wav_audio = AudioSegment(
- wav_buffer.getvalue(), frame_rate=config.sampling_rate,
- sample_width=2, channels=1)
+ wav_audio = AudioSegment.from_wav(wav_buffer)
+ wav_audio.frame_rate=config.sampling_rate
  buffer = io.BytesIO()
  wav_audio.export(buffer, format=response_format)
 

diff --git a/requirements.openaiapi.txt b/requirements.openaiapi.txt
@@ -2,3 +2,4 @@ fastapi
 python-multipart
 uvicorn[standard]
 pydub
+pyrubberband