Add support for word-level audio transcription timestamp granularity (#733)

* Add support for audio transcription timestamp_granularities word
* Fixup multiple timestamp granularities
sashabaranov · May 7, 2024 · 3334a9c · 3334a9c
1 parent c9953a7
commit 3334a9c
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 6 deletions.
diff --git a/audio.go b/audio.go
@@ -27,8 +27,14 @@ const (
  AudioResponseFormatVTT AudioResponseFormat = "vtt"
 )
 
+type TranscriptionTimestampGranularity string
+
+const (
+ TranscriptionTimestampGranularityWord TranscriptionTimestampGranularity = "word"
+ TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
+)
+
 // AudioRequest represents a request structure for audio API.
-// ResponseFormat is not supported for now. We only return JSON text, which may be sufficient.
 type AudioRequest struct {
  Model string
 
@@ -38,10 +44,11 @@ type AudioRequest struct {
  // Reader is an optional io.Reader when you do not want to use an existing file.
  Reader io.Reader
 
- Prompt string // For translation, it should be in English
- Temperature float32
- Language string // For translation, just do not use it. It seems "en" works, not confirmed...
- Format AudioResponseFormat
+ Prompt string
+ Temperature float32
+ Language string // Only for transcription.
+ Format AudioResponseFormat
+ TimestampGranularities []TranscriptionTimestampGranularity // Only for transcription.
 }
 
 // AudioResponse represents a response structure for audio API.
@@ -62,6 +69,11 @@ type AudioResponse struct {
  NoSpeechProb float64 `json:"no_speech_prob"`
  Transient bool `json:"transient"`
  } `json:"segments"`
+ Words []struct {
+ Word string `json:"word"`
+ Start float64 `json:"start"`
+ End float64 `json:"end"`
+ } `json:"words"`
  Text string `json:"text"`
 
  httpHeader
@@ -179,6 +191,15 @@ func audioMultipartForm(request AudioRequest, b utils.FormBuilder) error {
  }
  }
 
+ if len(request.TimestampGranularities) > 0 {
+ for _, tg := range request.TimestampGranularities {
+ err = b.WriteField("timestamp_granularities[]", string(tg))
+ if err != nil {
+ return fmt.Errorf("writing timestamp_granularities[]: %w", err)
+ }
+ }
+ }
+
  // Close the multipart writer
  return b.Close()
 }

diff --git a/audio_api_test.go b/audio_api_test.go
@@ -105,6 +105,10 @@ func TestAudioWithOptionalArgs(t *testing.T) {
  Temperature: 0.5,
  Language: "zh",
  Format: openai.AudioResponseFormatSRT,
+ TimestampGranularities: []openai.TranscriptionTimestampGranularity{
+ openai.TranscriptionTimestampGranularitySegment,
+ openai.TranscriptionTimestampGranularityWord,
+ },
  }
  _, err := tc.createFn(ctx, req)
  checks.NoError(t, err, "audio API error")

diff --git a/audio_test.go b/audio_test.go
@@ -24,6 +24,10 @@ func TestAudioWithFailingFormBuilder(t *testing.T) {
  Temperature: 0.5,
  Language: "en",
  Format: AudioResponseFormatSRT,
+ TimestampGranularities: []TranscriptionTimestampGranularity{
+ TranscriptionTimestampGranularitySegment,
+ TranscriptionTimestampGranularityWord,
+ },
  }
 
  mockFailedErr := fmt.Errorf("mock form builder fail")
@@ -47,7 +51,7 @@ func TestAudioWithFailingFormBuilder(t *testing.T) {
  return nil
  }
 
- failOn := []string{"model", "prompt", "temperature", "language", "response_format"}
+ failOn := []string{"model", "prompt", "temperature", "language", "response_format", "timestamp_granularities[]"}
  for _, failingField := range failOn {
  failForField = failingField
  mockFailedErr = fmt.Errorf("mock form builder fail on field %s", failingField)