From 7315e9ecdcf668419b88ead3976d939f5dfe143c Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 4 Nov 2021 22:37:38 -0400 Subject: [PATCH 1/2] Embed audio samples in generated tutorials It turned out that generated tutorials can embed the audio if the following conditions are met. This commit changes how audio samples are shown in tutorials so that they become playable in doc. 1. There is only one `IPython.display.Audio` call in a cell 2. `IPython.display.Audio` is the last function called in the cell 3. Audio format is `wav` (`flac` can be contained, but browsers (Chrome/Safari) won't play it) Ref: https://stackoverflow.com/a/33109647 --- .../tts/tacotron2_pipeline_tutorial.py | 6 +- .../wav2vec2/forced_alignment_tutorial.py | 71 ++++++++++++++++--- .../speech_recognition_pipeline_tutorial.py | 4 +- 3 files changed, 67 insertions(+), 14 deletions(-) diff --git a/examples/gallery/tts/tacotron2_pipeline_tutorial.py b/examples/gallery/tts/tacotron2_pipeline_tutorial.py index 3e14c6e78a..c45f8d8e89 100644 --- a/examples/gallery/tts/tacotron2_pipeline_tutorial.py +++ b/examples/gallery/tts/tacotron2_pipeline_tutorial.py @@ -270,7 +270,7 @@ def text_to_sequence(text): ax2.plot(waveforms[0].cpu().detach()) torchaudio.save("output_wavernn.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate) -IPython.display.display(IPython.display.Audio("output_wavernn.wav")) +IPython.display.Audio("output_wavernn.wav") ###################################################################### @@ -299,7 +299,7 @@ def text_to_sequence(text): ax2.plot(waveforms[0].cpu().detach()) torchaudio.save("output_griffinlim.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate) -IPython.display.display(IPython.display.Audio("output_griffinlim.wav")) +IPython.display.Audio("output_griffinlim.wav") ###################################################################### @@ -330,4 +330,4 @@ def text_to_sequence(text): ax2.plot(waveforms[0].cpu().detach()) 
torchaudio.save("output_waveglow.wav", waveforms[0:1].cpu(), sample_rate=22050) -IPython.display.display(IPython.display.Audio("output_waveglow.wav")) +IPython.display.Audio("output_waveglow.wav") diff --git a/examples/gallery/wav2vec2/forced_alignment_tutorial.py b/examples/gallery/wav2vec2/forced_alignment_tutorial.py index 2ad3baae13..f43d83d833 100644 --- a/examples/gallery/wav2vec2/forced_alignment_tutorial.py +++ b/examples/gallery/wav2vec2/forced_alignment_tutorial.py @@ -56,8 +56,8 @@ print(torchaudio.__version__) print(device) -SPEECH_URL = 'https://download.pytorch.org/torchaudio/test-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.flac' -SPEECH_FILE = 'speech.flac' +SPEECH_URL = 'https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav' +SPEECH_FILE = 'speech.wav' if not os.path.exists(SPEECH_FILE): with open(SPEECH_FILE, 'wb') as file: @@ -422,18 +422,71 @@ def plot_alignments(trellis, segments, word_segments, waveform): plot_alignments(trellis, segments, word_segments, waveform[0],) plt.show() -# Generate the audio for each segment -print(transcript) -IPython.display.display(IPython.display.Audio(SPEECH_FILE)) -ratio = waveform.size(1) / (trellis.size(0) - 1) -for i, word in enumerate(word_segments): +# A trick to embed the resulting audio to the generated file. +# `IPython.display.Audio` has to be the last call in a cell, +# and there should be only one call. 
+def _show(i): + ratio = waveform.size(1) / (trellis.size(0) - 1) + word = word_segments[i] x0 = int(ratio * word.start) x1 = int(ratio * word.end) filename = f"{i}_{word.label}.wav" torchaudio.save(filename, waveform[:, x0:x1], bundle.sample_rate) - print(f"{word.label}: {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f}") - IPython.display.display(IPython.display.Audio(filename)) + print(f"{word.label} ({word.score:.2f}): {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f} sec") + return filename + +###################################################################### +# + +# Generate the audio for each segment +print(transcript) +IPython.display.Audio(SPEECH_FILE) + + +###################################################################### +# + +IPython.display.Audio(_show(0)) + +###################################################################### +# + +IPython.display.Audio(_show(1)) + +###################################################################### +# + +IPython.display.Audio(_show(2)) + +###################################################################### +# + +IPython.display.Audio(_show(3)) + +###################################################################### +# + +IPython.display.Audio(_show(4)) + +###################################################################### +# + +IPython.display.Audio(_show(5)) + +###################################################################### +# + +IPython.display.Audio(_show(6)) + +###################################################################### +# + +IPython.display.Audio(_show(7)) + +###################################################################### +# +IPython.display.Audio(_show(8)) ###################################################################### # Conclusion diff --git a/examples/gallery/wav2vec2/speech_recognition_pipeline_tutorial.py b/examples/gallery/wav2vec2/speech_recognition_pipeline_tutorial.py index afd98e4173..bf955f2600 100644 --- 
a/examples/gallery/wav2vec2/speech_recognition_pipeline_tutorial.py +++ b/examples/gallery/wav2vec2/speech_recognition_pipeline_tutorial.py @@ -120,7 +120,7 @@ # Creative Commos BY 4.0. # -IPython.display.display(IPython.display.Audio(SPEECH_FILE)) +IPython.display.Audio(SPEECH_FILE) ###################################################################### @@ -273,7 +273,7 @@ def forward(self, emission: torch.Tensor) -> str: # print(transcript) -IPython.display.display(IPython.display.Audio(SPEECH_FILE)) +IPython.display.Audio(SPEECH_FILE) ###################################################################### From 3616c10de99712b71223fce6d82ea2dd01a7f738 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Fri, 5 Nov 2021 10:46:56 -0400 Subject: [PATCH 2/2] Tweak a bit --- .../wav2vec2/forced_alignment_tutorial.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/gallery/wav2vec2/forced_alignment_tutorial.py b/examples/gallery/wav2vec2/forced_alignment_tutorial.py index f43d83d833..e9df67c207 100644 --- a/examples/gallery/wav2vec2/forced_alignment_tutorial.py +++ b/examples/gallery/wav2vec2/forced_alignment_tutorial.py @@ -424,8 +424,8 @@ def plot_alignments(trellis, segments, word_segments, waveform): # A trick to embed the resulting audio to the generated file. # `IPython.display.Audio` has to be the last call in a cell, -# and there should be only one call. -def _show(i): +# and there should be only one call per cell. 
+def display_segment(i): ratio = waveform.size(1) / (trellis.size(0) - 1) word = word_segments[i] x0 = int(ratio * word.start) @@ -433,7 +433,7 @@ def _show(i): filename = f"{i}_{word.label}.wav" torchaudio.save(filename, waveform[:, x0:x1], bundle.sample_rate) print(f"{word.label} ({word.score:.2f}): {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f} sec") - return filename + return IPython.display.Audio(filename) ###################################################################### # @@ -446,47 +446,47 @@ def _show(i): ###################################################################### # -IPython.display.Audio(_show(0)) +display_segment(0) ###################################################################### # -IPython.display.Audio(_show(1)) +display_segment(1) ###################################################################### # -IPython.display.Audio(_show(2)) +display_segment(2) ###################################################################### # -IPython.display.Audio(_show(3)) +display_segment(3) ###################################################################### # -IPython.display.Audio(_show(4)) +display_segment(4) ###################################################################### # -IPython.display.Audio(_show(5)) +display_segment(5) ###################################################################### # -IPython.display.Audio(_show(6)) +display_segment(6) ###################################################################### # -IPython.display.Audio(_show(7)) +display_segment(7) ###################################################################### # -IPython.display.Audio(_show(8)) +display_segment(8) ###################################################################### # Conclusion