Skip to content

Commit

Permalink
Add support for word-level audio transcription timestamp granularity (#…
Browse files Browse the repository at this point in the history
…733)

* Add support for audio transcription timestamp_granularities word

* Fixup multiple timestamp granularities
  • Loading branch information
agcom committed May 7, 2024
1 parent c9953a7 commit 3334a9c
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 6 deletions.
31 changes: 26 additions & 5 deletions audio.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,14 @@ const (
AudioResponseFormatVTT AudioResponseFormat = "vtt"
)

// TranscriptionTimestampGranularity names a timestamp granularity that can be
// requested for an audio transcription. Each value set on
// AudioRequest.TimestampGranularities is written to the multipart form as a
// "timestamp_granularities[]" field.
type TranscriptionTimestampGranularity string

// Timestamp granularity values intended for AudioRequest.TimestampGranularities.
const (
// TranscriptionTimestampGranularityWord is the "word" granularity.
// NOTE(review): presumably this is what fills AudioResponse.Words — confirm against the API docs.
TranscriptionTimestampGranularityWord TranscriptionTimestampGranularity = "word"
// TranscriptionTimestampGranularitySegment is the "segment" granularity.
// NOTE(review): presumably this corresponds to the "segments" list of AudioResponse — confirm against the API docs.
TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
)

// AudioRequest represents a request structure for audio API.
// ResponseFormat is not supported for now. We only return JSON text, which may be sufficient.
type AudioRequest struct {
Model string

Expand All @@ -38,10 +44,11 @@ type AudioRequest struct {
// Reader is an optional io.Reader when you do not want to use an existing file.
Reader io.Reader

Prompt string // For translation, it should be in English
Temperature float32
Language string // For translation, just do not use it. It seems "en" works, not confirmed...
Format AudioResponseFormat
Prompt string
Temperature float32
Language string // Only for transcription.
Format AudioResponseFormat
TimestampGranularities []TranscriptionTimestampGranularity // Only for transcription.
}

// AudioResponse represents a response structure for audio API.
Expand All @@ -62,6 +69,11 @@ type AudioResponse struct {
NoSpeechProb float64 `json:"no_speech_prob"`
Transient bool `json:"transient"`
} `json:"segments"`
Words []struct {
Word string `json:"word"`
Start float64 `json:"start"`
End float64 `json:"end"`
} `json:"words"`
Text string `json:"text"`

httpHeader
Expand Down Expand Up @@ -179,6 +191,15 @@ func audioMultipartForm(request AudioRequest, b utils.FormBuilder) error {
}
}

if len(request.TimestampGranularities) > 0 {
for _, tg := range request.TimestampGranularities {
err = b.WriteField("timestamp_granularities[]", string(tg))
if err != nil {
return fmt.Errorf("writing timestamp_granularities[]: %w", err)
}
}
}

// Close the multipart writer
return b.Close()
}
Expand Down
4 changes: 4 additions & 0 deletions audio_api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ func TestAudioWithOptionalArgs(t *testing.T) {
Temperature: 0.5,
Language: "zh",
Format: openai.AudioResponseFormatSRT,
TimestampGranularities: []openai.TranscriptionTimestampGranularity{
openai.TranscriptionTimestampGranularitySegment,
openai.TranscriptionTimestampGranularityWord,
},
}
_, err := tc.createFn(ctx, req)
checks.NoError(t, err, "audio API error")
Expand Down
6 changes: 5 additions & 1 deletion audio_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ func TestAudioWithFailingFormBuilder(t *testing.T) {
Temperature: 0.5,
Language: "en",
Format: AudioResponseFormatSRT,
TimestampGranularities: []TranscriptionTimestampGranularity{
TranscriptionTimestampGranularitySegment,
TranscriptionTimestampGranularityWord,
},
}

mockFailedErr := fmt.Errorf("mock form builder fail")
Expand All @@ -47,7 +51,7 @@ func TestAudioWithFailingFormBuilder(t *testing.T) {
return nil
}

failOn := []string{"model", "prompt", "temperature", "language", "response_format"}
failOn := []string{"model", "prompt", "temperature", "language", "response_format", "timestamp_granularities[]"}
for _, failingField := range failOn {
failForField = failingField
mockFailedErr = fmt.Errorf("mock form builder fail on field %s", failingField)
Expand Down

0 comments on commit 3334a9c

Please sign in to comment.