Commit: fix

masci committed Mar 11, 2023
1 parent 1ab52be commit 361fd95

Showing 2 changed files with 68 additions and 68 deletions.
Binary file modified: nodes/text2speech/tests/samples/answer.wav
136 changes: 68 additions & 68 deletions nodes/text2speech/tests/test_nodes.py
@@ -2,17 +2,25 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 import os
+import platform
 from pathlib import Path
 
 import pytest
 import numpy as np
-import soundfile as sf
-from haystack.schema import Span, Answer, Document
+
+try:
+    import soundfile as sf
+    import ffmpeg
+
+    soundfile_not_found = False
+except:
+    soundfile_not_found = True
 
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
-import ffmpeg
 
-from text2speech import AnswerToSpeech, DocumentToSpeech
-from text2speech.utils import TextToSpeech
+from haystack.schema import Span, Answer, SpeechAnswer, Document, SpeechDocument
+from haystack.nodes.audio import AnswerToSpeech, DocumentToSpeech
+from haystack.nodes.audio._text_to_speech import TextToSpeech

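Note: the new import block above swallows missing optional dependencies and records the failure in a soundfile_not_found flag instead of erroring at collection time. The code that consumes the flag is outside this diff; presumably it gates the audio tests with a skip marker, along these lines (a sketch under that assumption, not part of the commit):

    # Hypothetical consumer of the soundfile_not_found flag defined in the
    # imports above; the actual decorator lives outside this diff.
    import pytest

    @pytest.mark.skipif(soundfile_not_found, reason="soundfile and ffmpeg are required for text2speech tests")
    class TestTextToSpeech:
        ...

As a design note, the bare except: is broader than needed here; except ImportError: would state the intent more precisely.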
@@ -60,60 +68,64 @@ def test_text_to_speech_audio_data(self, tmp_path, whisper_helper: WhisperHelper):
             samplerate=text2speech.model.fs,
         )
 
-        expedtec_doc = whisper_helper.transcribe(str(SAMPLES_PATH / "audio" / "answer.wav"))
+        expected_doc = whisper_helper.transcribe(str(SAMPLES_PATH / "answer.wav"))
         generated_doc = whisper_helper.transcribe(str(tmp_path / "audio1.wav"))
 
-        assert expedtec_doc == generated_doc
+        assert expected_doc[0] in generated_doc[0]
 
-    def test_text_to_speech_audio_file(self, tmp_path):
+    def test_text_to_speech_audio_file(self, tmp_path, whisper_helper: WhisperHelper):
         text2speech = TextToSpeech(
             model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
-            transformers_params={"seed": 777, "always_fix_seed": True},
-        )
-        expected_audio_data, _ = sf.read(SAMPLES_PATH / "answer.wav")
-        audio_file = text2speech.text_to_audio_file(
-            text="answer", generated_audio_dir=tmp_path / "test_audio"
+            transformers_params={"seed": 4535, "always_fix_seed": True},
         )
+
+        audio_file = text2speech.text_to_audio_file(text="answer", generated_audio_dir=tmp_path / "test_audio")
         assert os.path.exists(audio_file)
-        assert np.allclose(expected_audio_data, sf.read(audio_file)[0], atol=0.001)
+
+        expected_doc = whisper_helper.transcribe(str(SAMPLES_PATH / "answer.wav"))
+        generated_doc = whisper_helper.transcribe(str(audio_file))
+
+        assert expected_doc[0] in generated_doc[0]
 
-    def test_text_to_speech_compress_audio(self, tmp_path):
+    @pytest.mark.skipif(platform.system() == "Darwin", reason="MP3 compression not working on M1")
+    def test_text_to_speech_compress_audio(self, tmp_path, whisper_helper: WhisperHelper):
         text2speech = TextToSpeech(
             model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
-            transformers_params={"seed": 777, "always_fix_seed": True},
+            transformers_params={"seed": 4535, "always_fix_seed": True},
         )
         expected_audio_file = SAMPLES_PATH / "answer.wav"
         audio_file = text2speech.text_to_audio_file(
-            text="answer",
-            generated_audio_dir=tmp_path / "test_audio",
-            audio_format="mp3",
+            text="answer", generated_audio_dir=tmp_path / "test_audio", audio_format="mp3"
         )
         assert os.path.exists(audio_file)
         assert audio_file.suffix == ".mp3"
-        # FIXME find a way to make sure the compressed audio is similar enough to the wav version.
-        # At a manual inspection, the code seems to be working well.
+
+        expected_doc = whisper_helper.transcribe(str(expected_audio_file))
+        generated_doc = whisper_helper.transcribe(str(audio_file))
+
+        assert expected_doc[0] in generated_doc[0]
 
-    def test_text_to_speech_naming_function(self, tmp_path):
+    def test_text_to_speech_naming_function(self, tmp_path, whisper_helper: WhisperHelper):
         text2speech = TextToSpeech(
             model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
-            transformers_params={"seed": 777, "always_fix_seed": True},
+            transformers_params={"seed": 4535, "always_fix_seed": True},
         )
         expected_audio_file = SAMPLES_PATH / "answer.wav"
         audio_file = text2speech.text_to_audio_file(
-            text="answer",
-            generated_audio_dir=tmp_path / "test_audio",
-            audio_naming_function=lambda text: text,
+            text="answer", generated_audio_dir=tmp_path / "test_audio", audio_naming_function=lambda text: text
         )
         assert os.path.exists(audio_file)
         assert audio_file.name == expected_audio_file.name
-        assert np.allclose(
-            sf.read(expected_audio_file)[0], sf.read(audio_file)[0], atol=0.001
-        )
+
+        expected_doc = whisper_helper.transcribe(str(expected_audio_file))
+        generated_doc = whisper_helper.transcribe(str(audio_file))
+
+        assert expected_doc[0] in generated_doc[0]
 
 
 @pytest.mark.integration
 class TestAnswerToSpeech:
-    def test_answer_to_speech(self, tmp_path):
+    def test_answer_to_speech(self, tmp_path, whisper_helper: WhisperHelper):
         text_answer = Answer(
             answer="answer",
             type="extractive",
@@ -123,70 +135,58 @@ def test_answer_to_speech(self, tmp_path):
             meta={"some_meta": "some_value"},
         )
         expected_audio_answer = SAMPLES_PATH / "answer.wav"
-        expected_audio_context = (
-            SAMPLES_PATH / "the context for this answer is here.wav"
-        )
+        expected_audio_context = SAMPLES_PATH / "the context for this answer is here.wav"
 
         answer2speech = AnswerToSpeech(
             generated_audio_dir=tmp_path / "test_audio",
             audio_params={"audio_naming_function": lambda text: text},
-            transformers_params={"seed": 777, "always_fix_seed": True},
+            transformers_params={"seed": 4535, "always_fix_seed": True},
         )
         results, _ = answer2speech.run(answers=[text_answer])
 
-        audio_answer: Answer = results["answers"][0]
-        assert isinstance(audio_answer, Answer)
-        assert audio_answer.meta["audio"]["answer"]["path"] == expected_audio_answer
-        assert audio_answer["audio"]["context"]["path"] == expected_audio_context
+        audio_answer: SpeechAnswer = results["answers"][0]
+        assert isinstance(audio_answer, SpeechAnswer)
+        assert audio_answer.type == "generative"
+        assert audio_answer.answer_audio.name == expected_audio_answer.name
+        assert audio_answer.context_audio.name == expected_audio_context.name
         assert audio_answer.answer == "answer"
         assert audio_answer.context == "the context for this answer is here"
         assert audio_answer.offsets_in_document == [Span(31, 37)]
         assert audio_answer.offsets_in_context == [Span(21, 27)]
         assert audio_answer.meta["some_meta"] == "some_value"
-        assert audio_answer.meta["audio"]["answer"]["path"] == "wav"
-        assert audio_answer.meta["audio"]["context"]["path"] == "wav"
+        assert audio_answer.meta["audio_format"] == "wav"
 
-        assert np.allclose(
-            sf.read(audio_answer.answer_audio)[0],
-            sf.read(expected_audio_answer)[0],
-            atol=0.001,
-        )
-        assert np.allclose(
-            sf.read(audio_answer.context_audio)[0],
-            sf.read(expected_audio_context)[0],
-            atol=0.001,
-        )
+        expected_doc = whisper_helper.transcribe(str(expected_audio_answer))
+        generated_doc = whisper_helper.transcribe(str(audio_answer.answer_audio))
+
+        assert expected_doc[0] in generated_doc[0]
 
 
 @pytest.mark.integration
 class TestDocumentToSpeech:
-    def test_document_to_speech(self, tmp_path):
+    def test_document_to_speech(self, tmp_path, whisper_helper: WhisperHelper):
         text_doc = Document(
-            content="this is the content of the document",
-            content_type="text",
-            meta={"name": "test_document.txt"},
-        )
-        expected_audio_content = (
-            SAMPLES_PATH / "this is the content of the document.wav"
+            content="this is the content of the document", content_type="text", meta={"name": "test_document.txt"}
         )
+        expected_audio_content = SAMPLES_PATH / "this is the content of the document.wav"
 
         doc2speech = DocumentToSpeech(
             generated_audio_dir=tmp_path / "test_audio",
             audio_params={"audio_naming_function": lambda text: text},
-            transformers_params={"seed": 777, "always_fix_seed": True},
+            transformers_params={"seed": 4535, "always_fix_seed": True},
         )
 
         results, _ = doc2speech.run(documents=[text_doc])
 
-        audio_doc: Document = results["documents"][0]
-        assert isinstance(audio_doc, Document)
-        assert audio_doc.content_type == "text"
-        assert audio_doc.meta["audio"]["content"]["path"] == expected_audio_content
+        audio_doc: SpeechDocument = results["documents"][0]
+        assert isinstance(audio_doc, SpeechDocument)
+        assert audio_doc.content_type == "audio"
+        assert audio_doc.content_audio.name == expected_audio_content.name
         assert audio_doc.content == "this is the content of the document"
         assert audio_doc.meta["name"] == "test_document.txt"
-        assert audio_doc.meta["audio"]["content"]["format"] == "wav"
+        assert audio_doc.meta["audio_format"] == "wav"
 
-        assert np.allclose(
-            sf.read(audio_doc.content_audio)[0],
-            sf.read(expected_audio_content)[0],
-            atol=0.001,
-        )
+        expected_doc = whisper_helper.transcribe(str(expected_audio_content))
+        generated_doc = whisper_helper.transcribe(str(audio_doc.content_audio))
+
+        assert expected_doc[0] in generated_doc[0]
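Note: together with the regenerated answer.wav sample, the pattern of this commit is to trade exact comparisons for semantic ones. The seed moves from 777 to 4535 (with always_fix_seed keeping generation deterministic across runs), and every np.allclose(..., atol=0.001) waveform check becomes a transcript containment check, which survives sample regeneration and small numerical drift in the TTS model. A minimal illustration of the new assertion style, with hypothetical transcript values:

    # Both files should *say* the same thing, even if their samples differ.
    expected_doc = whisper_helper.transcribe(str(expected_audio_file))  # e.g. ["answer"]
    generated_doc = whisper_helper.transcribe(str(audio_file))          # e.g. ["answer"]
    assert expected_doc[0] in generated_doc[0]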
