openai · jongwook · Mar 6, 2023 · Jan 20, 2023 · Jan 21, 2023 · Jan 21, 2023
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -21,6 +21,5 @@ jobs:
  - run: conda install -n test ffmpeg python=${{ matrix.python-version }} pytorch=${{ matrix.pytorch-version }} cpuonly -c pytorch
  - uses: actions/checkout@v2
  - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH
- - run: pip install pytest
- - run: pip install .
- - run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]'
+ - run: pip install .["dev"]
+ - run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]' -m 'not requires_cuda'
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
+numba
 numpy
 torch
 tqdm

diff --git a/setup.py b/setup.py
@@ -1,4 +1,5 @@
 import os
+import sys
 
 import pkg_resources
 from setuptools import setup, find_packages
@@ -9,6 +10,21 @@ def read_version(fname="whisper/version.py"):
  return locals()["__version__"]
 
 
+# Extra install requirements that depend on the machine running setup.py.
+requirements = []
+if sys.platform.startswith("linux"):
+    # default to the newer triton series; downgraded below for old CUDA toolkits
+    triton_requirement = "triton>=2.0.0.dev20221202"
+    try:
+        import re
+        import subprocess
+
+        nvcc_output = subprocess.check_output(["nvcc", "--version"])
+        # the CUDA release is parsed from the last line printed by nvcc
+        version_line = nvcc_output.strip().split(b"\n")[-1]
+        cuda_matches = re.findall(rb"cuda_([\d]+)\.([\d]+)", version_line)
+        major, minor = cuda_matches[0]
+        cuda_version = (int(major), int(minor))
+        if cuda_version < (11, 4):
+            # the last version supporting CUDA < 11.4
+            triton_requirement = "triton==2.0.0.dev20221011"
+    except (IndexError, OSError, subprocess.SubprocessError):
+        # nvcc is missing, failed, or printed no parsable version: keep the default
+        pass
+    requirements.append(triton_requirement)
+
 setup(
  name="openai-whisper",
  py_modules=["whisper"],
@@ -22,7 +38,7 @@ def read_version(fname="whisper/version.py"):
  url="https://github.com/openai/whisper",
  license="MIT",
  packages=find_packages(exclude=["tests*"]),
- install_requires=[
+ install_requires=requirements + [
  str(r)
  for r in pkg_resources.parse_requirements(
  open(os.path.join(os.path.dirname(__file__), "requirements.txt"))
@@ -32,5 +48,5 @@ def read_version(fname="whisper/version.py"):
  "console_scripts": ["whisper=whisper.transcribe:cli"],
  },
  include_package_data=True,
- extras_require={"dev": ["pytest"]},
+ extras_require={"dev": ["pytest", "scipy"]},
 )
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,10 @@
+import random as rand
+
+import numpy
+import pytest
+
+
+@pytest.fixture
+def random():
+    """Seed both Python's `random` module and NumPy's global RNG with 42."""
+    seed = 42
+    rand.seed(seed)
+    numpy.random.seed(seed)
diff --git a/tests/test_timing.py b/tests/test_timing.py
@@ -0,0 +1,82 @@
+import pytest
+import numpy as np
+import scipy.ndimage
+import torch
+
+from whisper.timing import dtw_cpu, dtw_cuda, median_filter
+
+
+# (N, M) shapes of the cost matrices used by the parametrized DTW tests
+sizes = [
+ (10, 20), (32, 16), (123, 1500), (234, 189),
+]
+# shapes of the 4-D random tensors used by the parametrized median-filter tests
+shapes = [
+ (4, 5, 20, 345), (6, 12, 240, 512),
+]
+
+
+@pytest.mark.parametrize("N, M", sizes)
+def test_dtw(N: int, M: int):
+    """Check that `dtw_cpu` recovers a known path planted in a random cost matrix.
+
+    A random monotonic path from (0, 0) to (N - 1, M - 1) is generated and the
+    cost of every cell on it is lowered by 1, so that it is the cheapest
+    alignment; the path returned by `dtw_cpu` must match it.
+    """
+    # seed NumPy's global RNG so that a failing case can be reproduced exactly
+    np.random.seed(42)
+
+    # N - 1 "down" moves (0) and M - 1 "right" moves (1), in random order
+    steps = np.concatenate([np.zeros(N - 1), np.ones(M - 1)])
+    np.random.shuffle(steps)
+    x = np.random.random((N, M)).astype(np.float32)
+
+    i, j, k = 0, 0, 0
+    trace = []
+    while True:
+        # costs start in [0, 1), so cells on the path end up strictly cheaper
+        x[i, j] -= 1
+        trace.append((i, j))
+
+        if k == len(steps):
+            break
+
+        if k + 1 < len(steps) and steps[k] != steps[k + 1]:
+            # two consecutive moves in different directions become one diagonal move
+            i += 1
+            j += 1
+            k += 2
+            continue
+
+        if steps[k] == 0:
+            i += 1
+        if steps[k] == 1:
+            j += 1
+        k += 1
+
+    trace = np.array(trace).T
+    dtw_trace = dtw_cpu(x)
+
+    assert np.allclose(trace, dtw_trace)
+
+
+@pytest.mark.requires_cuda
+@pytest.mark.parametrize("N, M", sizes)
+def test_dtw_cuda_equivalence(N: int, M: int):
+    """The CUDA DTW must return the same path as the CPU implementation."""
+    cost_matrix = np.random.randn(N, M).astype(np.float32)
+
+    path_cpu = dtw_cpu(cost_matrix)
+    path_cuda = dtw_cuda(torch.from_numpy(cost_matrix).cuda())
+
+    assert np.allclose(path_cpu, path_cuda)
+
+
+@pytest.mark.parametrize("shape", shapes)
+def test_median_filter(shape):
+    """`median_filter` must agree with SciPy's median filter along the last axis."""
+    x = torch.randn(*shape)
+
+    for filter_width in (3, 5, 7, 13):
+        ours = median_filter(x, filter_width)
+        # a window of size 1 on the three leading axes filters the last axis only
+        window = (1, 1, 1, filter_width)
+        reference = scipy.ndimage.median_filter(x, window, mode="nearest")
+
+        assert np.allclose(ours, reference)
+
+
+@pytest.mark.requires_cuda
+@pytest.mark.parametrize("shape", shapes)
+def test_median_filter_equivalence(shape):
+    """The CPU and CUDA code paths of `median_filter` must produce matching output."""
+    x = torch.randn(*shape)
+    x_gpu = x.cuda()
+
+    for filter_width in (3, 5, 7, 13):
+        on_cpu = median_filter(x, filter_width)
+        on_gpu = median_filter(x_gpu, filter_width).cpu()
+
+        assert np.allclose(on_cpu, on_gpu)
diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py
@@ -13,10 +13,21 @@ def test_transcribe(model_name: str):
  audio_path = os.path.join(os.path.dirname(__file__), "jfk.flac")
 
  language = "en" if model_name.endswith(".en") else None
- result = model.transcribe(audio_path, language=language, temperature=0.0)
+ result = model.transcribe(audio_path, language=language, temperature=0.0, word_timestamps=True)
  assert result["language"] == "en"
 
  transcription = result["text"].lower()
  assert "my fellow americans" in transcription
  assert "your country" in transcription
  assert "do for you" in transcription
+
+ timing_checked = False
+ for segment in result["segments"]:
+ for timing in segment["words"]:
+ assert timing["start"] < timing["end"]
+ if timing["word"].strip() == "Americans":
+ assert timing["start"] <= 1.75
+ assert timing["end"] >= 2.05
+ timing_checked = True
+
+ assert timing_checked
diff --git a/whisper/audio.py b/whisper/audio.py
@@ -18,6 +18,10 @@
 N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000: number of samples in a chunk
 N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH) # 3000: number of frames in a mel spectrogram input
 
+N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2 # the initial convolutions has stride 2
+FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH) # 100 mel frames in 1s (10ms each)
+TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN) # 50 audio tokens in 1s (20ms each)
+
 
 def load_audio(file: str, sr: int = SAMPLE_RATE):
  """

diff --git a/whisper/timing.py b/whisper/timing.py
@@ -0,0 +1,198 @@
+from typing import List, TYPE_CHECKING
+
+import numba
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from .audio import HOP_LENGTH, SAMPLE_RATE, TOKENS_PER_SECOND
+from .tokenizer import Tokenizer
+
+if TYPE_CHECKING:
+ from .model import Whisper
+
+
+def median_filter(x: torch.Tensor, filter_width: int):
+    """Median-filter `x` along its last dimension with a window of `filter_width`.
+
+    `x` must be a 3D or 4D tensor and `filter_width` a positive odd number.
+    The last dimension is padded on both sides by replicating the edge values
+    before filtering. CUDA tensors are handed to the triton implementation.
+    """
+    assert 3 <= x.ndim <= 4, "`median_filter()` is implemented for only 3D or 4D tensors"
+    assert filter_width > 0 and filter_width % 2 == 1, "`filter_width` should be an odd number"
+
+    half_width = filter_width // 2
+    padded = F.pad(x, (half_width, half_width, 0, 0), mode='replicate')
+
+    if padded.is_cuda:
+        from .triton_ops import median_filter_cuda
+        return median_filter_cuda(padded, filter_width)
+
+    # sort() is faster than torch.median (https://github.com/pytorch/pytorch/issues/51450)
+    windows = padded.unfold(-1, filter_width, 1)
+    sorted_windows = windows.sort()[0]
+    # the middle element of each sorted window is its median
+    return sorted_windows[..., half_width]
+
+
+@numba.jit
+def backtrace(trace: np.ndarray):
+    """Follow the DTW move table back from its last cell to the origin.
+
+    `trace[i, j]` encodes the move taken to reach cell (i, j): 0 is diagonal,
+    1 decrements the row and 2 decrements the column. The table is modified
+    in place (its first row and column are overwritten). Returns a 2 x L
+    array of (row, column) indices, shifted by -1, ordered from start to end.
+    """
+    row = trace.shape[0] - 1
+    col = trace.shape[1] - 1
+    # on the borders only one move is possible: along the first row or column
+    trace[0, :] = 2
+    trace[:, 0] = 1
+
+    steps = []
+    while row > 0 or col > 0:
+        steps.append((row - 1, col - 1))
+        move = trace[row, col]
+
+        if move == 0:
+            row -= 1
+            col -= 1
+        elif move == 1:
+            row -= 1
+        elif move == 2:
+            col -= 1
+        else:
+            raise ValueError("Unexpected trace[i, j]")
+
+    path = np.array(steps)
+    # steps were collected end-to-start; reverse them and put the two indices in rows
+    return path[::-1, :].T
+
+
+@numba.jit(nopython=True, parallel=True)
+def dtw_cpu(x: np.ndarray):
+    """Run dynamic time warping over the 2-D cost matrix `x` on the CPU.
+
+    Fills an accumulated-cost table and a table of moves (0: diagonal,
+    1: previous row, 2: previous column), then returns the result of
+    `backtrace` on the move table.
+    """
+    n_rows, n_cols = x.shape
+    cost = np.ones((n_rows + 1, n_cols + 1), dtype=np.float32) * np.inf
+    trace = -np.ones((n_rows + 1, n_cols + 1), dtype=np.float32)
+
+    cost[0, 0] = 0
+    for col in range(1, n_cols + 1):
+        for row in range(1, n_rows + 1):
+            diagonal = cost[row - 1, col - 1]
+            prev_row = cost[row - 1, col]
+            prev_col = cost[row, col - 1]
+
+            # strict comparisons: any tie falls through to the last branch
+            if diagonal < prev_row and diagonal < prev_col:
+                best, move = diagonal, 0
+            elif prev_row < diagonal and prev_row < prev_col:
+                best, move = prev_row, 1
+            else:
+                best, move = prev_col, 2
+
+            cost[row, col] = x[row - 1, col - 1] + best
+            trace[row, col] = move
+
+    return backtrace(trace)
+
+
+def dtw_cuda(x, BLOCK_SIZE=1024):
+    """Run dynamic time warping over the 2-D cost matrix `x` with the triton kernel.
+
+    Args:
+        x: 2-D tensor of shape (M, N); M must be smaller than `BLOCK_SIZE`.
+            NOTE(review): presumably a CUDA tensor, as `dtw` only calls this
+            when `x.is_cuda` -- confirm for other callers.
+        BLOCK_SIZE: block size forwarded to `dtw_kernel`.
+
+    Returns:
+        The result of `backtrace` on the move table produced by the kernel.
+    """
+    from .triton_ops import dtw_kernel
+
+    M, N = x.shape
+    assert M < BLOCK_SIZE, f"M should be smaller than {BLOCK_SIZE=}"
+
+    # skew the matrix: padding each row with M + 1 infs and re-chunking the flat
+    # buffer into rows of length N + M shifts row i to the right by i positions
+    x_skew = F.pad(x, (0, M + 1), value=np.inf).flatten()[: M * (N + M)].reshape(M, N + M)
+    x_skew = x_skew.T.contiguous()
+    cost = torch.ones(N + M + 2, M + 2) * np.inf
+    cost[0, 0] = 0
+    # keep the buffers on the same device as the input rather than on the
+    # current default GPU, so that inputs on a non-default GPU work
+    cost = cost.to(x.device)
+    trace = torch.zeros_like(cost, dtype=torch.int32)
+
+    dtw_kernel[(1,)](
+        cost,
+        trace,
+        x_skew,
+        x_skew.stride(0),
+        cost.stride(0),
+        trace.stride(0),
+        N,
+        M,
+        BLOCK_SIZE=BLOCK_SIZE
+    )
+
+    # undo the skew on the move table and keep its first N + 1 columns
+    trace = trace.T.flatten()[:(M + 1) * (M + N + 3)].reshape(M + 1, M + N + 3)[:, :N + 1]
+    return backtrace(trace.cpu().numpy())
+
+
+def dtw(x: torch.Tensor) -> np.ndarray:
+    """Dispatch dynamic time warping to the CUDA or the CPU implementation.
+
+    CUDA tensors go to `dtw_cuda`; any other tensor is converted to a
+    float64 NumPy array and handed to `dtw_cpu`.
+    """
+    if not x.is_cuda:
+        as_numpy = x.double().cpu().numpy()
+        return dtw_cpu(as_numpy)
+
+    return dtw_cuda(x)
+
+
+def add_word_timestamps(
+    model: "Whisper",
+    tokenizer: Tokenizer,
+    mel: torch.Tensor,
+    num_frames: int,
+    segments: List[dict],
+    *,
+    medfilt_width: int = 7,
+    qk_scale: float = 1.0,
+):
+    """Attach word-level timestamps to `segments`, in place.
+
+    The tokens of all segments are fed through the model once while forward
+    hooks capture the last output of every decoder cross-attention layer.
+    The captured weights are median-filtered, normalized and averaged, then
+    aligned to the tokens with dynamic time warping.
+
+    Args:
+        model: model to run; its decoder blocks' `cross_attn` modules are hooked.
+        tokenizer: supplies the special tokens and the word-splitting method.
+        mel: mel spectrogram; a batch dimension is added before the forward pass.
+        num_frames: the attention weights are cut to `num_frames // 2` positions.
+        segments: dicts with "tokens" and "seek" keys. Each gets a "words" list of
+            `dict(word, start, end)`; "start" and "end" of a segment are
+            overwritten when it received at least one word.
+        medfilt_width: width of the median filter applied to the weights.
+        qk_scale: factor applied to the weights before the softmax.
+
+    Returns:
+        None. Nothing is done when `segments` is empty.
+    """
+    if len(segments) == 0:
+        return
+
+    tokens = torch.tensor(
+        [
+            *tokenizer.sot_sequence,
+            tokenizer.timestamp_begin,
+            *[t for segment in segments for t in segment["tokens"]],
+            tokenizer.timestamp_begin + mel.shape[-1] // 2,
+            tokenizer.eot,
+        ]
+    ).to(model.device)
+
+    # install hooks on the cross attention layers to retrieve the attention weights
+    QKs = [None] * model.dims.n_text_layer
+    hooks = [
+        block.cross_attn.register_forward_hook(
+            lambda _, ins, outs, index=i: QKs.__setitem__(index, outs[-1])
+        )
+        for i, block in enumerate(model.decoder.blocks)
+    ]
+
+    try:
+        with torch.no_grad():
+            model(mel.unsqueeze(0), tokens.unsqueeze(0))
+    finally:
+        # always detach the hooks, so a failed forward pass does not leave them on the model
+        for hook in hooks:
+            hook.remove()
+
+    # only the weights captured from the last six layers are used
+    weights = torch.cat(QKs[-6:])  # layers * heads * tokens * frames
+    weights = weights[:, :, :, : num_frames // 2]
+    weights = median_filter(weights, medfilt_width)
+    weights = (weights * qk_scale).softmax(dim=-1)
+    weights = weights / weights.norm(dim=-2, keepdim=True)
+    # negated so that strong attention becomes a low alignment cost
+    matrix = weights.mean(axis=(0, 1)).neg()
+
+    text_indices, time_indices = dtw(matrix)
+
+    # positions on the path where the token index advances, and their times in seconds
+    jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
+    jump_times = time_indices[jumps] / TOKENS_PER_SECOND
+
+    if tokenizer.language in {"zh", "ja", "th", "lo", "my"}:
+        # These languages don't typically use spaces, so it is difficult to split words
+        # without morpheme analysis. Here, we instead split words at any
+        # position where the tokens are decoded as valid unicode points
+        split_tokens = tokenizer.split_tokens_on_unicode
+    else:
+        split_tokens = tokenizer.split_tokens_on_spaces
+
+    words, word_tokens = split_tokens(tokens[1:].tolist())
+
+    # index of the segment each token came from; None for the leading special tokens
+    token_sources = np.repeat(np.arange(len(segments)), [len(s["tokens"]) for s in segments])
+    token_sources = [None] * len(tokenizer.sot_sequence) + list(token_sources)
+
+    time_offset = segments[0]["seek"] * HOP_LENGTH / SAMPLE_RATE
+    # cumulative token counts: word i spans word_boundaries[i] to word_boundaries[i + 1]
+    word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens]), (1, 0))
+    start_times = time_offset + jump_times[word_boundaries[:-1]]
+    end_times = time_offset + jump_times[word_boundaries[1:]]
+
+    for segment in segments:
+        segment["words"] = []
+
+    for i, (word, start, end) in enumerate(zip(words, start_times, end_times)):
+        # skip special tokens and bare punctuation
+        if word.startswith("<|") or word.strip() in ".,!?、。":  # TODO: expand
+            continue
+
+        segment = segments[token_sources[word_boundaries[i]]]
+        segment["words"].append(dict(word=word, start=round(start, 2), end=round(end, 2)))
+
+    # adjust the segment-level timestamps based on the word-level timestamps
+    for segment in segments:
+        if len(segment["words"]) > 0:
+            segment["start"] = segment["words"][0]["start"]
+            segment["end"] = segment["words"][-1]["end"]