Skip to content

Commit

Permalink
chore: Refactor cleanstream-filter-data.h and update timestamps in wh…
Browse files Browse the repository at this point in the history
…isper-processing.cpp
  • Loading branch information
royshil committed May 13, 2024
1 parent 78bf45e commit a1a2a20
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 44 deletions.
4 changes: 2 additions & 2 deletions src/cleanstream-filter-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ struct cleanstream_data {
uint32_t sample_rate; // input sample rate
// How many input frames (in input sample rate) are needed for the next whisper frame
size_t frames;
// How many frames were processed in the last whisper frame (this is dynamic)
size_t last_num_frames;
int current_result;
uint64_t current_result_end_timestamp;
uint64_t current_result_start_timestamp;
uint32_t delay_ms;

/* Silero VAD */
Expand Down
84 changes: 51 additions & 33 deletions src/cleanstream-filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,48 +58,56 @@ struct obs_audio_data *cleanstream_filter_audio(void *data, struct obs_audio_dat
return audio;
}

std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex); // scoped lock
size_t input_buffer_size = 0;
{
std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex); // scoped lock

if (audio != nullptr && audio->frames > 0) {
// push back current audio data to input circlebuf
for (size_t c = 0; c < gf->channels; c++) {
circlebuf_push_back(&gf->input_buffers[c], audio->data[c],
audio->frames * sizeof(float));
if (audio != nullptr && audio->frames > 0) {
// push back current audio data to input circlebuf
for (size_t c = 0; c < gf->channels; c++) {
circlebuf_push_back(&gf->input_buffers[c], audio->data[c],
audio->frames * sizeof(float));
}
// push audio packet info (timestamp/frame count) to info circlebuf
struct cleanstream_audio_info info = {0};
info.frames = audio->frames; // number of frames in this packet
info.timestamp = audio->timestamp; // timestamp of this packet
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
}
// push audio packet info (timestamp/frame count) to info circlebuf
struct cleanstream_audio_info info = {0};
info.frames = audio->frames; // number of frames in this packet
info.timestamp = audio->timestamp; // timestamp of this packet
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
input_buffer_size = gf->input_buffers[0].size;
}

// check the size of the input buffer - if it's more than <delay>ms worth of audio, start playback
if (gf->input_buffers[0].size > gf->delay_ms * gf->sample_rate * sizeof(float) / 1000) {
if (input_buffer_size > gf->delay_ms * gf->sample_rate * sizeof(float) / 1000) {
// find needed number of frames from the incoming audio
size_t num_frames_needed = audio->frames;

std::vector<float> temporary_buffers[MAX_AUDIO_CHANNELS];
uint64_t timestamp = 0;

while (temporary_buffers[0].size() < num_frames_needed) {
struct cleanstream_audio_info info_out = {0};
{
std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex);
// pop from input buffers to get audio packet info
circlebuf_pop_front(&gf->info_buffer, &info_out, sizeof(info_out));
if (timestamp == 0) {
timestamp = info_out.timestamp;
}
while (temporary_buffers[0].size() < num_frames_needed) {
struct cleanstream_audio_info info_out = {0};
// pop from input buffers to get audio packet info
circlebuf_pop_front(&gf->info_buffer, &info_out, sizeof(info_out));
if (timestamp == 0) {
timestamp = info_out.timestamp;
}

// pop from input circlebuf to audio data
for (size_t i = 0; i < gf->channels; i++) {
// increase the size of the temporary buffer to hold the incoming audio in addition
// to the existing audio on the temporary buffer
temporary_buffers[i].resize(temporary_buffers[i].size() +
info_out.frames);
circlebuf_pop_front(&gf->input_buffers[i],
temporary_buffers[i].data() +
temporary_buffers[i].size() -
info_out.frames,
info_out.frames * sizeof(float));
// pop from input circlebuf to audio data
for (size_t i = 0; i < gf->channels; i++) {
// increase the size of the temporary buffer to hold the incoming audio in addition
// to the existing audio on the temporary buffer
temporary_buffers[i].resize(temporary_buffers[i].size() +
info_out.frames);
circlebuf_pop_front(&gf->input_buffers[i],
temporary_buffers[i].data() +
temporary_buffers[i].size() -
info_out.frames,
info_out.frames * sizeof(float));
}
}
}
const size_t num_frames = temporary_buffers[0].size();
Expand All @@ -110,13 +118,17 @@ struct obs_audio_data *cleanstream_filter_audio(void *data, struct obs_audio_dat
memset(gf->output_data.array, 0, frames_size_bytes * gf->channels);

int inference_result = DetectionResult::DETECTION_RESULT_UNKNOWN;
uint64_t inference_result_start_timestamp = 0;
uint64_t inference_result_end_timestamp = 0;
{
std::lock_guard<std::mutex> lock(gf->whisper_outbuf_mutex);
std::lock_guard<std::mutex> outbuf_lock(gf->whisper_outbuf_mutex);
inference_result = gf->current_result;
inference_result_start_timestamp = gf->current_result_start_timestamp;
inference_result_end_timestamp = gf->current_result_end_timestamp;
}

if (inference_result == DetectionResult::DETECTION_RESULT_BEEP) {
obs_log(LOG_INFO, "Beep detected, timestamp: %llu", timestamp);
if (timestamp > inference_result_start_timestamp &&
timestamp < inference_result_end_timestamp) {
if (gf->replace_sound == REPLACE_SOUNDS_SILENCE) {
// set the audio to 0
for (size_t i = 0; i < gf->channels; i++) {
Expand Down Expand Up @@ -220,6 +232,10 @@ void cleanstream_update(void *data, obs_data_t *s)
gf->log_level = (int)obs_data_get_int(s, "log_level");
gf->vad_enabled = obs_data_get_bool(s, "vad_enabled");
gf->log_words = obs_data_get_bool(s, "log_words");
gf->delay_ms = BUFFER_SIZE_MSEC + INITIAL_DELAY_MSEC;
gf->current_result = DetectionResult::DETECTION_RESULT_UNKNOWN;
gf->current_result_start_timestamp = 0;
gf->current_result_end_timestamp = 0;

obs_log(gf->log_level, "update whisper model");
update_whisper_model(gf, s);
Expand Down Expand Up @@ -270,8 +286,10 @@ void *cleanstream_create(obs_data_t *settings, obs_source_t *filter)
gf->channels = audio_output_get_channels(obs_get_audio());
gf->sample_rate = audio_output_get_sample_rate(obs_get_audio());
gf->frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)BUFFER_SIZE_MSEC));
gf->last_num_frames = 0;
gf->delay_ms = BUFFER_SIZE_MSEC + INITIAL_DELAY_MSEC;
gf->current_result = DetectionResult::DETECTION_RESULT_UNKNOWN;
gf->current_result_start_timestamp = 0;
gf->current_result_end_timestamp = 0;

for (size_t i = 0; i < MAX_AUDIO_CHANNELS; i++) {
circlebuf_init(&gf->input_buffers[i]);
Expand Down
27 changes: 18 additions & 9 deletions src/whisper-utils/whisper-processing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,26 +305,31 @@ int run_whisper_inference(struct cleanstream_data *gf, const float *pcm32f_data,
long long process_audio_from_buffer(struct cleanstream_data *gf)
{
uint64_t start_timestamp = 0;
uint64_t end_timestamp = 0;

{
// scoped lock the buffer mutex
std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex);

// copy gf->frames from the end of the input buffer to the copy_buffers
for (size_t c = 0; c < gf->channels; c++) {
circlebuf_peek_front(&gf->input_buffers[c], gf->copy_buffers[c],
gf->frames * sizeof(float));
circlebuf_peek_back(&gf->input_buffers[c], gf->copy_buffers[c],
gf->frames * sizeof(float));
}

// peek at the info_buffer to get the timestamp of the first info
// peek at the info_buffer to get the timestamp of the last info
struct cleanstream_audio_info info_from_buf = {0};
circlebuf_peek_front(&gf->info_buffer, &info_from_buf,
sizeof(struct cleanstream_audio_info));
start_timestamp = info_from_buf.timestamp;
circlebuf_peek_back(&gf->info_buffer, &info_from_buf,
sizeof(struct cleanstream_audio_info));
end_timestamp = info_from_buf.timestamp;
start_timestamp =
end_timestamp - (int)(gf->frames * 1000 / gf->sample_rate) * 1000000;
}

obs_log(gf->log_level, "processing %lu frames (%d ms), start timestamp %llu ", gf->frames,
(int)(gf->frames * 1000 / gf->sample_rate), start_timestamp);
obs_log(gf->log_level,
"processing %lu frames (%d ms), start timestamp %llu, end timestamp %llu ",
gf->frames, (int)(gf->frames * 1000 / gf->sample_rate), start_timestamp,
end_timestamp);

// time the audio processing
auto start = std::chrono::high_resolution_clock::now();
Expand Down Expand Up @@ -361,6 +366,10 @@ long long process_audio_from_buffer(struct cleanstream_data *gf)
{
std::lock_guard<std::mutex> lock(gf->whisper_outbuf_mutex);
gf->current_result = inference_result;
if (gf->current_result == DETECTION_RESULT_BEEP) {
gf->current_result_start_timestamp = start_timestamp;
gf->current_result_end_timestamp = end_timestamp;
}
}
} else {
gf->current_result = DETECTION_RESULT_SILENCE;
Expand All @@ -377,7 +386,7 @@ long long process_audio_from_buffer(struct cleanstream_data *gf)
obs_log(gf->log_level, "audio processing of %u ms new data took %d ms", audio_processed_ms,
(int)duration);

if ((duration + 300) > (gf->delay_ms - audio_processed_ms)) {
if (duration > (gf->delay_ms - audio_processed_ms)) {
obs_log(gf->log_level,
"audio processing (%d ms) longer than delay (%lu ms), increase delay",
(int)duration, gf->delay_ms);
Expand Down

0 comments on commit a1a2a20

Please sign in to comment.