Add GPU acceleration (timsainb#14)

* added pad clipping flag * added tensorflow backend * updated docstrings * updated readme * updated readme * updated package version * updated readme * updated readme * updated website * updated testing * updated testing and black formatting * update testing * added test requirements * updated travis yml to install requirements-txt * updated travis yml * updated test-req
lff5 · Jun 11, 2019 · db94fe2 · db94fe2
1 parent 3cfc83e
commit db94fe2
Show file tree

Hide file tree

Showing 11 changed files with 616 additions and 134 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -6,4 +6,7 @@ python:
 script:
  - pytest --cov=noisereduce/
 after_success:
- - coveralls
+ - coveralls
+install:
+ - pip install -r requirements.txt
+ - pip install -r requirements-test.txt
diff --git a/README.md b/README.md
@@ -23,6 +23,8 @@
 ## Installation
 `pip install noisereduce`
 
+*noisereduce optionally uses TensorFlow as a backend to speed up the FFT and Gaussian convolution. It is not listed in requirements.txt because (1) it is optional and (2) both tensorflow-gpu and tensorflow (CPU) are compatible with this package. The package requires TensorFlow 2+ for all TensorFlow operations.*
+
 ## Usage
 (see notebooks)
 
@@ -45,6 +47,8 @@ win_length (int): Each frame of audio is windowed by `window()`. The window will
 hop_length (int): number of audio frames between STFT columns.
 n_std_thresh (int): how many standard deviations louder than the mean dB of the noise (at each frequency level) to be considered signal
 prop_decrease (float): To what extent should you decrease noise (1 = all, 0 = none)
+pad_clipping (bool): Pad the signals with zeros to ensure that the reconstructed data is equal length to the data
+ use_tensorflow (bool): Use tensorflow as a backend for convolution and fft to speed up computation
 verbose (bool): Whether to plot the steps of the algorithm
 ```
 <div style="text-align:center">

diff --git a/environment.yml b/environment.yml
@@ -1,4 +1,4 @@
-name: birdbrain
+name: noisereduce
 channels:
  - conda-forge
  - defaults

diff --git a/noisereduce/._noisereduce.py b/noisereduce/._noisereduce.py
diff --git a/noisereduce/noisereduce.py b/noisereduce/noisereduce.py
@@ -3,16 +3,73 @@
 import librosa
 from noisereduce.plotting import plot_reduction_steps
 from tqdm.autonotebook import tqdm
+import warnings
 
+# TensorFlow is an optional backend: import it if present, otherwise warn and
+# leave `tf` bound to None so the rest of the module can still be imported.
+try:
+    import tensorflow as tf
+except ImportError:
+    tf = None
+    warnings.warn(
+        "Tensorflow is not installed and cannot be used for GPU accelerated STFT"
+    )
+else:
+    print(
+        "GPUs available: {}".format(tf.config.experimental.list_physical_devices("GPU"))
+    )
+    # Compare the whole major-version component; looking only at the first
+    # character would read a version such as "10.1" as major version 1.
+    if int(tf.__version__.split(".")[0]) < 2:
+        warnings.warn(
+            "Tensorflow version is below 2.0, some GPU accelerated functionality may not work"
+        )
+
+
+def _stft(y, n_fft, hop_length, win_length, use_tensorflow=False):
+    """Compute the STFT of a signal with the selected backend
+
+    Arguments:
+        y {[type]} -- the signal to transform
+        n_fft {[type]} -- FFT size forwarded to the backend
+        hop_length {[type]} -- hop length forwarded to the backend
+        win_length {[type]} -- window length forwarded to the backend
+
+    Keyword Arguments:
+        use_tensorflow {bool} -- dispatch to _stft_tensorflow instead of librosa.stft (default: {False})
+
+    Returns:
+        the value returned by the chosen backend
+    """
+    if not use_tensorflow:
+        return librosa.stft(
+            y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=True
+        )
+    return _stft_tensorflow(y, n_fft, hop_length, win_length)
+
+
+def _istft(y, n_fft, hop_length, win_length, use_tensorflow=False):
+    """Invert an STFT matrix back to a signal with the selected backend
+
+    Arguments:
+        y {[type]} -- the STFT matrix to invert
+        n_fft {[type]} -- FFT size; only forwarded to the tensorflow backend
+        hop_length {[type]} -- hop length forwarded to the backend
+        win_length {[type]} -- window length forwarded to the backend
+
+    Keyword Arguments:
+        use_tensorflow {bool} -- dispatch to _istft_tensorflow instead of librosa.istft (default: {False})
+
+    Returns:
+        the value returned by the chosen backend
+    """
+    if use_tensorflow:
+        # the tensorflow helper is handed the transpose of the matrix
+        return _istft_tensorflow(y.T, n_fft, hop_length, win_length)
+    # Pass the optional parameters by keyword so the call does not depend on
+    # their positional order (they are keyword-only in recent librosa releases).
+    return librosa.istft(y, hop_length=hop_length, win_length=win_length)
 
-def _stft(y, n_fft, hop_length, win_length):
- return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
+
+def _stft_librosa(y, n_fft, hop_length, win_length):
+    """Compute the STFT of a signal with librosa.stft, using center=True
+
+    Arguments:
+        y {[type]} -- the signal to transform
+        n_fft {[type]} -- FFT size passed to librosa.stft
+        hop_length {[type]} -- hop length passed to librosa.stft
+        win_length {[type]} -- window length passed to librosa.stft
+    """
+    stft_kwargs = dict(
+        y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=True
+    )
+    return librosa.stft(**stft_kwargs)
 
 
-def _istft(y, hop_length, win_length):
+def _istft_librosa(y, hop_length, win_length):
  return librosa.istft(y, hop_length, win_length)
 
 
+def _stft_tensorflow(y, n_fft, hop_length, win_length):
+    """Compute the STFT of a signal with tf.signal.stft
+
+    Uses a Hann window and pad_end=True, then returns the transposed
+    NumPy array of the result.
+
+    Arguments:
+        y {[type]} -- the signal to transform
+        n_fft {[type]} -- passed as the FFT length
+        hop_length {[type]} -- passed as the frame step
+        win_length {[type]} -- passed as the frame length
+    """
+    spectrogram = tf.signal.stft(
+        y,
+        win_length,
+        hop_length,
+        n_fft,
+        pad_end=True,
+        window_fn=tf.signal.hann_window,
+    )
+    as_numpy = spectrogram.numpy()
+    return as_numpy.T
+
+
+def _istft_tensorflow(y, n_fft, hop_length, win_length):
+    """Invert an STFT matrix with tf.signal.inverse_stft
+
+    The input is cast to complex64 first and the result is returned as a
+    NumPy array.
+
+    Arguments:
+        y {[type]} -- the STFT matrix to invert
+        n_fft {[type]} -- passed as the FFT length
+        hop_length {[type]} -- passed as the frame step
+        win_length {[type]} -- passed as the frame length
+    """
+    stft_complex = y.astype(np.complex64)
+    signal = tf.signal.inverse_stft(stft_complex, win_length, hop_length, n_fft)
+    return signal.numpy()
+
+
 def _amp_to_db(x):
     """Convert amplitudes to dB with librosa (ref=1.0, amin=1e-20, top_db=80.0)"""
     decibels = librosa.core.amplitude_to_db(x, ref=1.0, amin=1e-20, top_db=80.0)
     return decibels
 
@@ -31,9 +88,7 @@ def update_pbar(pbar, message):
 
 def _smoothing_filter(n_grad_freq, n_grad_time):
  """Generates a filter to smooth the mask for the spectrogram
- 
- [description]
- 
+ 
  Arguments:
  n_grad_freq {[type]} -- [how many frequency channels to smooth over with the mask.]
  n_grad_time {[type]} -- [how many time channels to smooth over with the mask.]
@@ -58,10 +113,8 @@ def _smoothing_filter(n_grad_freq, n_grad_time):
 
 
 def mask_signal(sig_stft_db, sig_mask, mask_gain_dB, sig_stft):
- """[summary]
- 
- [description]
- 
+ """ Reduces amplitude of time/frequency regions of a spectrogram based upon a mask 
+ 
  Arguments:
  sig_stft_db {[type]} -- spectrogram of signal in dB
  sig_mask {[type]} -- mask to apply to signal
@@ -83,6 +136,33 @@ def mask_signal(sig_stft_db, sig_mask, mask_gain_dB, sig_stft):
  return sig_stft_amp, sig_stft_db_masked
 
 
+def convolve_gaussian(sig_mask, smoothing_filter, use_tensorflow=False):
+    """ Convolves a gaussian filter with a mask (or any image)
+
+    Both backends perform a 2-D convolution and return an array with the
+    same shape as the mask.
+
+    Arguments:
+        sig_mask {[type]} -- The signal mask (2-D array)
+        smoothing_filter {[type]} -- the filter to convolve (2-D array)
+
+    Keyword Arguments:
+        use_tensorflow {bool} -- use tensorflow.nn.conv2d or scipy.signal.fftconvolve (default: {False})
+
+    Returns:
+        the smoothed mask; float32 when computed with tensorflow
+    """
+    if not use_tensorflow:
+        return scipy.signal.fftconvolve(sig_mask, smoothing_filter, mode="same")
+
+    # tf.nn.conv2d reads its input as [batch, height, width, channels] and its
+    # filters as [height, width, in_channels, out_channels]. The mask therefore
+    # has to be a single one-channel image; putting both new axes at the end
+    # would make every row a separate width-1 image.
+    img = np.asarray(sig_mask, dtype="float32")[np.newaxis, :, :, np.newaxis]
+    # conv2d computes a cross-correlation, so flip the kernel to obtain a true
+    # convolution like the scipy branch.
+    # NOTE(review): alignment with mode="same" is assumed for odd-sized filters
+    # only -- confirm if even-sized filters are ever used.
+    kernel = np.ascontiguousarray(
+        np.asarray(smoothing_filter, dtype="float32")[::-1, ::-1]
+    )[:, :, np.newaxis, np.newaxis]
+    smoothed = tf.nn.conv2d(img, kernel, strides=[1, 1, 1, 1], padding="SAME")
+    # drop the batch and channel axes explicitly so size-1 mask axes survive
+    return smoothed.numpy()[0, :, :, 0]
+
+
 def reduce_noise(
  audio_clip,
  noise_clip,
@@ -93,6 +173,8 @@ def reduce_noise(
  hop_length=512,
  n_std_thresh=1.5,
  prop_decrease=1.0,
+ pad_clipping=True,
+ use_tensorflow=False,
  verbose=False,
 ):
  """Remove noise from audio based upon a clip containing only noise
@@ -107,6 +189,8 @@ def reduce_noise(
  hop_length (int):number audio of frames between STFT columns.
  n_std_thresh (int): how many standard deviations louder than the mean dB of the noise (at each frequency level) to be considered signal
  prop_decrease (float): To what extent should you decrease noise (1 = all, 0 = none)
+ pad_clipping (bool): Pad the signals with zeros to ensure that the reconstructed data is equal length to the data
+ use_tensorflow (bool): Use tensorflow as a backend for convolution and fft to speed up computation
  verbose (bool): Whether to plot the steps of the algorithm
 
  Returns:
@@ -120,7 +204,9 @@ def reduce_noise(
 
  update_pbar(pbar, "STFT on noise")
  # STFT over noise
- noise_stft = _stft(noise_clip, n_fft, hop_length, win_length)
+ noise_stft = _stft(
+ noise_clip, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow
+ )
  noise_stft_db = _amp_to_db(np.abs(noise_stft)) # convert to dB
  # Calculate statistics over noise
  update_pbar(pbar, "STFT on signal")
@@ -129,7 +215,15 @@ def reduce_noise(
  noise_thresh = mean_freq_noise + std_freq_noise * n_std_thresh
  # STFT over signal
  update_pbar(pbar, "STFT on signal")
- sig_stft = _stft(audio_clip, n_fft, hop_length, win_length)
+
+ # pad signal with zeros to avoid extra frames being clipped if desired
+ if pad_clipping:
+ nsamp = len(audio_clip)
+ audio_clip = np.pad(audio_clip, [0, hop_length], mode="constant")
+
+ sig_stft = _stft(
+ audio_clip, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow
+ )
  sig_stft_db = _amp_to_db(np.abs(sig_stft))
  update_pbar(pbar, "Generate mask")
  # Calculate value to mask dB to
@@ -145,19 +239,38 @@ def reduce_noise(
  update_pbar(pbar, "Smooth mask")
  # Create a smoothing filter for the mask in time and frequency
  smoothing_filter = _smoothing_filter(n_grad_freq, n_grad_time)
+
  # convolve the mask with a smoothing filter
+ sig_mask = convolve_gaussian(sig_mask, smoothing_filter, use_tensorflow)
+
  sig_mask = scipy.signal.fftconvolve(sig_mask, smoothing_filter, mode="same")
  sig_mask = sig_mask * prop_decrease
  update_pbar(pbar, "Apply mask")
  # mask the signal
+
  sig_stft_amp, sig_stft_db_masked = mask_signal(
  sig_stft_db, sig_mask, mask_gain_dB, sig_stft
  )
+
  update_pbar(pbar, "Recover signal")
  # recover the signal
- recovered_signal = _istft(sig_stft_amp, hop_length, win_length)
+ recovered_signal = _istft(
+ sig_stft_amp, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow
+ )
+ # fix the recovered signal length if padding signal
+ if pad_clipping:
+ recovered_signal = librosa.util.fix_length(recovered_signal, nsamp)
+
  recovered_spec = _amp_to_db(
- np.abs(_stft(recovered_signal, n_fft, hop_length, win_length))
+ np.abs(
+ _stft(
+ recovered_signal,
+ n_fft,
+ hop_length,
+ win_length,
+ use_tensorflow=use_tensorflow,
+ )
+ )
  )
  if verbose:
  plot_reduction_steps(

diff --git a/noisereduce/utils.py b/noisereduce/utils.py
@@ -0,0 +1,15 @@
+import numpy as np
+
+
+def int16_to_float32(data):
+    """ Converts from int16 wav to float32 wav
+
+    Arguments:
+        data {[type]} -- samples in the int16 range (array or sequence)
+
+    Returns:
+        float32 array equal to data / 32768.0
+
+    Raises:
+        ValueError -- if any sample has an absolute value above 32768
+    """
+    data = np.asarray(data)
+    # np.max fails on an empty array, so only range-check when samples exist
+    if data.size and np.max(np.abs(data)) > 32768:
+        raise ValueError("Data has values above 32768")
+    return (data / 32768.0).astype("float32")
+
+
+def float32_to_int16(data):
+    """ Converts from float32 wav to int16 wav
+
+    Data whose peak absolute value is above 1 is first normalised by that
+    peak so the scaled samples fit in int16.
+
+    Arguments:
+        data {[type]} -- float samples (array or sequence)
+
+    Returns:
+        int16 array equal to data * 32767, truncated by the integer cast
+    """
+    data = np.asarray(data)
+    if data.size:
+        # use the absolute peak: samples below -1 overflow int16 just like
+        # samples above 1 do
+        peak = np.max(np.abs(data))
+        if peak > 1:
+            data = data / peak
+    return np.array(data * 32767).astype("int16")