Improve language detection when using clip_timestamps

SYSTRAN · Jun 6, 2024 · b8cc0fc · b8cc0fc
1 parent 65551c0
commit b8cc0fc
Showing 1 changed file with 15 additions and 6 deletions.
diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
@@ -370,16 +370,25 @@ def transcribe(
  or language_detection_segments < 1
  ):
  language_detection_segments = 1
- seek = 0
+ if isinstance(clip_timestamps, str):
+ start_timestamp = float(clip_timestamps.split(",")[0])
+ else:
+ start_timestamp = clip_timestamps[0]
  detected_language_info = {}
+ seek = int(start_timestamp * self.frames_per_second)
  content_frames = (
  features.shape[-1] - self.feature_extractor.nb_max_frames
  )
- while (
- seek <= content_frames
- and seek
- < self.feature_extractor.nb_max_frames * language_detection_segments
- ):
+ # If seek is at or past the end of the features, rewind it to the last segment
+ if seek >= features.shape[-1]:
+ seek = content_frames
+ end_frames = min(
+ seek
+ + self.feature_extractor.nb_max_frames
+ * language_detection_segments,
+ features.shape[-1],
+ )
+ while seek < end_frames:
  segment = features[
  :, seek : seek + self.feature_extractor.nb_max_frames
  ]