webrtcvad_utils.py
# Code Borrowed From:
# 1. https://github.com/wiseman/py-webrtcvad/blob/3b39545dbb026d998bf407f1cb86e0ed6192a5a6/example.py
# 2. https://github.com/mozilla/DeepSpeech-examples/blob/r0.7/vad_transcriber/wavTranscriber.py
import collections
import logging

import webrtcvad

logger = logging.getLogger(__name__)


class Frame:
    """Represents a "frame" of audio data."""

    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration


def frame_generator(frame_duration_ms, audio, sample_rate):
    """Generates audio frames from PCM audio data.

    Takes the desired frame duration in milliseconds, the PCM data, and
    the sample rate.

    Yields Frames of the requested duration.
    """
    # Frame size in bytes: 2 bytes per sample for 16-bit PCM.
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    while offset + n < len(audio):
        yield Frame(audio[offset : offset + n], timestamp, duration)
        timestamp += duration
        offset += n
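

# Illustrative sizing check (not from the borrowed sources): at 16 kHz,
# 16-bit mono PCM, a 30 ms frame spans 16000 * 0.030 * 2 = 960 bytes.
def _demo_frame_sizing():
    """Sketch: frame_generator's output on 0.3 s of synthetic silence."""
    audio = b"\x00" * 9600  # 0.3 s of 16-bit silence at 16 kHz
    frames = list(frame_generator(30, audio, 16000))
    # The strict `offset + n < len(audio)` guard drops the final boundary
    # frame, so 9600 bytes yield 9 full 960-byte frames rather than 10.
    assert len(frames) == 9
    assert all(len(f.bytes) == 960 for f in frames)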


def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """Filters out non-voiced audio frames.

    Given a webrtcvad.Vad and a source of audio frames, yields only
    the voiced audio.

    Uses a padded, sliding window algorithm over the audio frames.
    When more than 90% of the frames in the window are voiced (as
    reported by the VAD), the collector triggers and begins yielding
    audio frames. Then the collector waits until 90% of the frames in
    the window are unvoiced to detrigger.

    The window is padded at the front and back to provide a small
    amount of silence or the beginnings/endings of speech around the
    voiced frames.

    Args:
        sample_rate: The audio sample rate, in Hz.
        frame_duration_ms: The frame duration in milliseconds.
        padding_duration_ms: The amount to pad the window, in milliseconds.
        vad: An instance of webrtcvad.Vad.
        frames: A source of audio frames (sequence or generator).

    Returns:
        [generator]: A generator that yields PCM audio data.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False
    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                for f, _ in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                triggered = False
                yield b"".join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
    # If we have any leftover voiced audio when we run out of input,
    # yield it.
    if voiced_frames:
        yield b"".join([f.bytes for f in voiced_frames])


def vad_segment_generator(wavFile, aggressiveness, desired_sample_rate=None):
    """Generate VAD segments. Filters out non-voiced audio frames.

    Args:
        wavFile (str): Path to the input wav file to run VAD on.
        aggressiveness (int): How aggressively to filter out non-speech,
            an integer between 0 and 3 (3 is the most aggressive).
        desired_sample_rate (int, optional): If set, resample the audio to
            this rate before running VAD. Defaults to None.

    Returns:
        [tuple]:
            ``segments``: A generator yielding the voiced audio chunks
            (the long input audio split into multiple smaller segments).
            ``sample_rate``: Sample rate of the input audio file.
            ``audio_length``: Duration of the input audio file.
    """
    from .transcribe_main import read_wave

    logger.debug("Caught the wav file @: %s", wavFile)
    audio, sample_rate, audio_length = read_wave(wavFile, desired_sample_rate)
    if sample_rate not in (8000, 16000, 32000, 48000):
        raise AssertionError(
            "The WebRTC VAD only accepts 16-bit mono PCM audio, "
            "sampled at 8000, 16000, 32000 or 48000 Hz."
        )
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    return segments, sample_rate, audio_length
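

# Hedged end-to-end sketch: this command-line wiring is illustrative and
# not part of the borrowed code. Because of the relative import above, it
# only runs as a module, e.g. `python -m <package>.webrtcvad_utils in.wav`,
# with the wav path assumed to be the first argument.
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.DEBUG)
    segments, sample_rate, audio_length = vad_segment_generator(
        sys.argv[1], aggressiveness=1
    )
    for i, segment in enumerate(segments):
        # Each segment is raw 16-bit mono PCM; wrap it in a WAV container
        # or feed it to an ASR model as needed.
        print("Segment %d: %d bytes" % (i, len(segment)))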