From e8622f9afc4eba139bf796c210f5c01081000472 Mon Sep 17 00:00:00 2001
From: taylorchu
Date: Mon, 7 Aug 2023 14:48:56 -0700
Subject: [PATCH] word timing tweaks (#1559)

* word timing tweaks

* comment on eot

* clearer comments
---
 whisper/timing.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/whisper/timing.py b/whisper/timing.py
index 207d877d..56e84d43 100644
--- a/whisper/timing.py
+++ b/whisper/timing.py
@@ -214,6 +214,13 @@ def find_alignment(
     text_indices, time_indices = dtw(-matrix)
 
     words, word_tokens = tokenizer.split_to_word_tokens(text_tokens + [tokenizer.eot])
+    if len(word_tokens) <= 1:
+        # return on eot only
+        # >>> np.pad([], (1, 0))
+        # array([0.])
+        # This results in crashes when we lookup jump_times with float, like
+        # IndexError: arrays used as indices must be of integer (or boolean) type
+        return []
     word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0))
 
     jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
@@ -297,8 +304,6 @@ def add_word_timestamps(
     # hack: truncate long words at sentence boundaries.
     # a better segmentation algorithm based on VAD should be able to replace this.
     if len(word_durations) > 0:
-        median_duration = np.median(word_durations)
-        max_duration = median_duration * 2
         sentence_end_marks = ".。!!??"
         # ensure words at sentence boundaries are not longer than twice the median word duration.
         for i in range(1, len(alignment)):
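
Context for the early return above: the snippet below is a minimal sketch, not part of the patch; the token id and the jump_times values are made up for illustration. It reproduces the failure mode quoted in the added comment: when only the eot token remains, the cumsum over an empty list yields a float64 array, np.pad keeps that dtype, and indexing with it raises the IndexError.

    import numpy as np

    word_tokens = [[50257]]  # hypothetical: only the eot token is left after splitting
    # cumsum over an empty list gives an empty float64 array; np.pad keeps the float dtype
    word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0))
    print(word_boundaries, word_boundaries.dtype)  # [0.] float64

    jump_times = np.array([0.0, 0.5, 1.0])  # stand-in values for the real jump times
    try:
        jump_times[word_boundaries]  # float arrays are rejected as indices
    except IndexError as err:
        print(err)  # arrays used as indices must be of integer (or boolean) type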