triton implementation of median_filter

openai · jongwook · Mar 6, 2023 · Jan 20, 2023 · Jan 21, 2023 · Jan 21, 2023
commit b61e8f4fd1b912b8d13ec13800bbf80d73905894
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,10 @@
+import random as rand
+
+import numpy
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def random():
+    """Seed every random number generator the tests draw from.
+
+    The fixture is ``autouse`` so that each test starts from the same RNG
+    state without having to request ``random`` explicitly; a test that does
+    list ``random`` as an argument keeps working unchanged.
+    """
+    rand.seed(42)
+    numpy.random.seed(42)
+
+    # Imported locally because this conftest does not otherwise need torch;
+    # seeding it makes inputs drawn with `torch.randn` reproducible as well.
+    import torch
+
+    torch.manual_seed(42)
diff --git a/tests/test_timing.py b/tests/test_timing.py
@@ -1,18 +1,21 @@
 import pytest
 import numpy as np
+import scipy.ndimage
 import torch
 
-from whisper.timing import dtw_cpu, dtw_cuda
+from whisper.timing import dtw_cpu, dtw_cuda, median_filter
 
 
+# (N, M) pairs used to parametrize the DTW tests.
 sizes = [
- (10, 20), (32, 16), (123, 1500), (234, 189)
+ (10, 20), (32, 16), (123, 1500), (234, 189),
+]
+# 4-D tensor shapes used to parametrize the median filter tests.
+shapes = [
+ (4, 5, 20, 345), (6, 12, 240, 512),
 ]
 
 
 @pytest.mark.parametrize("N, M", sizes)
 def test_dtw(N: int, M: int):
- np.random.seed(42)
  steps = np.concatenate([np.zeros(N - 1), np.ones(M - 1)])
  np.random.shuffle(steps)
  x = np.random.random((N, M)).astype(np.float32)
@@ -47,11 +50,33 @@ def test_dtw(N: int, M: int):
 @pytest.mark.requires_cuda
 @pytest.mark.parametrize("N, M", sizes)
 def test_dtw_cuda_equivalence(N: int, M: int):
+ """Check that `dtw_cuda` and `dtw_cpu` agree on one random (N, M) float32 input."""
- np.random.seed(42)
  x_numpy = np.random.randn(N, M).astype(np.float32)
  x_cuda = torch.from_numpy(x_numpy).cuda()
 
  trace_cpu = dtw_cpu(x_numpy)
  trace_cuda = dtw_cuda(x_cuda)
 
  assert np.allclose(trace_cpu, trace_cuda)
+
+
+@pytest.mark.parametrize("shape", shapes)
+def test_median_filter(shape):
+    """Compare `median_filter` with SciPy's median filter along the last axis."""
+    signal = torch.randn(*shape)
+
+    for width in (3, 5, 7, 13):
+        # A window of size 1 on the three leading axes restricts SciPy's
+        # filter to the last axis; mode="nearest" extends the input by
+        # repeating its edge values.
+        expected = scipy.ndimage.median_filter(signal, (1, 1, 1, width), mode="nearest")
+        actual = median_filter(signal, width)
+
+        assert np.allclose(actual, expected)
+
+
+@pytest.mark.requires_cuda
+@pytest.mark.parametrize("shape", shapes)
+def test_median_filter_equivalence(shape):
+    """Check that `median_filter` gives matching results for CPU and CUDA inputs."""
+    signal = torch.randn(*shape)
+
+    for width in (3, 5, 7, 13):
+        on_cpu = median_filter(signal, width)
+        # Same values, moved to the GPU for filtering; the result is copied
+        # back to the CPU before the comparison.
+        on_gpu = median_filter(signal.cuda(), width)
+
+        assert np.allclose(on_cpu, on_gpu.cpu())
diff --git a/whisper/timing.py b/whisper/timing.py
@@ -17,9 +17,13 @@ def median_filter(x: torch.Tensor, filter_width: int):
  assert 3 <= x.ndim <= 4, "`median_filter()` is implemented for only 3D or 4D tensors"
  assert filter_width > 0 and filter_width % 2 == 1, "`filter_width` should be an odd number"
 
- padded = F.pad(x, (0, 0, filter_width // 2, filter_width // 2), mode='replicate')
- slices = padded.unfold(-1, filter_width, 1)
- return slices.median(dim=-1).values
+ x = F.pad(x, (filter_width // 2, filter_width // 2, 0, 0), mode='replicate')
+ if x.is_cuda:
+ from .triton_ops import median_filter_cuda
+ return median_filter_cuda(x, filter_width)
+
+ # sort() is faster than torch.median (https://github.com/pytorch/pytorch/issues/51450)
+ return x.unfold(-1, filter_width, 1).sort()[0][..., filter_width // 2]
 
 
 @numba.jit

diff --git a/whisper/triton_ops.py b/whisper/triton_ops.py
@@ -1,3 +1,9 @@
+import math
+
+import numpy as np
+import torch
+from functools import lru_cache
+
 try:
  import triton
  import triton.language as tl
@@ -31,3 +37,56 @@ def dtw_kernel(cost, trace, x, x_stride, cost_stride, trace_stride, N, M, BLOCK_
  tl.store(trace_ptr + offsets, 2, mask=mask & (c2 <= c0) & (c2 <= c1))
  tl.store(trace_ptr + offsets, 1, mask=mask & (c1 <= c0) & (c1 <= c2))
  tl.store(trace_ptr + offsets, 0, mask=mask & (c0 <= c1) & (c0 <= c2))
+
+
+@lru_cache(maxsize=None)
+def median_kernel(filter_width: int):
+ """Return a Triton kernel specialised for `filter_width`.
+
+ The inner `kernel` function is a template: the three upper-case markers in
+ its body are replaced, in its source text, with generated code -- one load
+ per window position, a sequence of compare-and-swap steps, and the name of
+ the middle row, which is what gets stored to `y`. `lru_cache` keeps one
+ generated kernel per distinct `filter_width`.
+
+ Do not add comments that mention the markers inside `kernel` itself: they
+ would be rewritten by the same `str.replace` calls.
+ """
+ @triton.jit
+ def kernel(y, x, x_stride, y_stride, BLOCK_SIZE: tl.constexpr): # x.shape[-1] == filter_width
+ row_idx = tl.program_id(0)
+ offsets = tl.arange(0, BLOCK_SIZE)
+ mask = offsets < y_stride
+
+ x_ptr = x + row_idx * x_stride
+ y_ptr = y + row_idx * y_stride
+
+ LOAD_ALL_ROWS_HERE
+
+ BUBBLESORT_HERE
+
+ tl.store(y_ptr + offsets, MIDDLE_ROW_HERE, mask=mask)
+
+ # Build a fresh JITFunction from the undecorated function, then rewrite its
+ # `src` text below.
+ kernel = triton.JITFunction(kernel.fn)
+ # One masked load per window position: row{i} is the input row read with an
+ # extra offset of i elements.
+ # NOTE(review): the leading whitespace inside these string literals has to
+ # line up with the indentation of the kernel body in `kernel.src` -- confirm
+ # it has not been altered by reformatting.
+ kernel.src = kernel.src.replace(" LOAD_ALL_ROWS_HERE", "\n".join([
+ f" row{i} = tl.load(x_ptr + offsets + {i}, mask=mask)"
+ for i in range(filter_width)
+ ]))
+ # Bubble-sort style compare-and-swap of adjacent rows. Pass i settles row
+ # `filter_width - 1 - i`, so `filter_width // 2 + 1` passes are enough to
+ # fix the middle row of an odd-width window; the lower rows stay unsorted.
+ # NOTE(review): both selects use strict comparisons; if comparisons against
+ # NaN are false as usual, a NaN operand makes both pick row{j + 1} --
+ # confirm NaNs cannot reach this kernel.
+ kernel.src = kernel.src.replace(" BUBBLESORT_HERE", "\n\n".join([
+ "\n\n".join([
+ "\n".join([
+ f" smaller = tl.where(row{j} < row{j + 1}, row{j}, row{j + 1})",
+ f" larger = tl.where(row{j} > row{j + 1}, row{j}, row{j + 1})",
+ f" row{j} = smaller",
+ f" row{j + 1} = larger",
+ ])
+ for j in range(filter_width - i - 1)
+ ])
+ for i in range(filter_width // 2 + 1)
+ ]))
+ # The value stored to `y` is the middle row after the partial sort.
+ kernel.src = kernel.src.replace("MIDDLE_ROW_HERE", f"row{filter_width // 2}")
+
+ return kernel
+
+
+def median_filter_cuda(x: torch.Tensor, filter_width: int):
+    """Apply a median filter of given width along the last dimension of x.
+
+    No padding is added here: every output element is the median of
+    `filter_width` consecutive input elements, so the last dimension of the
+    result has `x.shape[-1] - filter_width + 1` entries.
+
+    Parameters
+    ----------
+    x : torch.Tensor
+        Input tensor; it is made contiguous before the kernel is launched.
+        Presumably a CUDA tensor, since the work is done by a Triton kernel --
+        confirm against the caller.
+    filter_width : int
+        Number of consecutive elements in each window.
+
+    Returns
+    -------
+    torch.Tensor
+        Newly allocated tensor with the same leading dimensions as `x`.
+    """
+    # The kernel reads `x` as rows of unit-stride elements, `x_stride` apart,
+    # indexed by a single flat row number. That only matches the tensor's
+    # layout when the tensor handed to the kernel is itself contiguous, not
+    # just the temporary the sliding-window view is taken from.
+    x = x.contiguous()
+    slices = x.unfold(-1, filter_width, 1)
+
+    # One program per row; converted to a plain int instead of a NumPy scalar.
+    grid = int(np.prod(slices.shape[:-2]))
+
+    kernel = median_kernel(filter_width)
+    # Freshly allocated, hence contiguous, with one entry per window.
+    y = x.new_empty(slices.shape[:-1])
+
+    # For contiguous tensors the distance between consecutive rows is the row
+    # length. Using the shape avoids relying on the stride reported for a
+    # dimension of size 1.
+    x_row, y_row = x.shape[-1], y.shape[-1]
+
+    # `tl.arange` needs a power-of-two extent: round the output row length up.
+    BLOCK_SIZE = 1 << (y_row - 1).bit_length()
+    kernel[(grid,)](y, x, x_row, y_row, BLOCK_SIZE=BLOCK_SIZE)
+
+    return y