Support for verbose_json for audio transcriptions #199

Open · wants to merge 1 commit into main
Sources/OpenAI/Public/Models/AudioTranscriptionQuery.swift (21 additions, 10 deletions)
@@ -8,14 +8,19 @@
import Foundation

public struct AudioTranscriptionQuery: Codable {

public enum TimestampGranularities: String, Codable, Equatable, CaseIterable {
case word
case segment
}

public enum ResponseFormat: String, Codable, Equatable, CaseIterable {
case json
case text
case verboseJson = "verbose_json"
case srt
case vtt
}

/// The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
public let file: Data
@@ -33,15 +38,19 @@
/// The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.
/// https://platform.openai.com/docs/guides/speech-to-text/prompting
public let language: String?
/// The timestamp granularities to populate for this transcription. response_format must be set to verbose_json to use timestamp granularities. Either or both of these options are supported: word or segment. Note: there is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
/// Defaults to segment.
public let timestampGranularities: [Self.TimestampGranularities]

public init(file: Data, fileType: Self.FileType, model: Model, prompt: String? = nil, temperature: Double? = nil, language: String? = nil, responseFormat: Self.ResponseFormat? = nil, timestampGranularities: [Self.TimestampGranularities] = []) {
self.file = file
self.fileType = fileType
self.model = model
self.prompt = prompt
self.temperature = temperature
self.language = language
self.responseFormat = responseFormat
self.timestampGranularities = timestampGranularities
}

public enum FileType: String, Codable, Equatable, CaseIterable {
@@ -88,8 +97,10 @@ extension AudioTranscriptionQuery: MultipartFormDataBodyEncodable {
.string(paramName: "prompt", value: prompt),
.string(paramName: "temperature", value: temperature),
.string(paramName: "language", value: language),
.string(paramName: "response_format", value: responseFormat)
])
.string(paramName: "response_format", value: responseFormat?.rawValue),
] + timestampGranularities.map({.string(paramName: "timestamp_granularities[]", value: $0)})
)

return bodyBuilder.build()
}
}
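For reviewers, a minimal usage sketch of the new surface area. It assumes the package's OpenAI client with its async audioTranscriptions(query:) method and the Model.whisper_1 constant, plus a hypothetical local file path; only responseFormat and timestampGranularities come from this PR.

import Foundation
import OpenAI

// Sketch: request a verbose JSON transcription with word- and segment-level
// timestamps. The API token and file path are placeholders.
func transcribeVerbose() async throws -> AudioTranscriptionResult {
    let openAI = OpenAI(apiToken: "YOUR_TOKEN")
    let audioData = try Data(contentsOf: URL(fileURLWithPath: "speech.m4a"))

    let query = AudioTranscriptionQuery(
        file: audioData,
        fileType: .m4a,
        model: .whisper_1,
        responseFormat: .verboseJson,             // required for timestamp granularities
        timestampGranularities: [.word, .segment] // either or both
    )

    return try await openAI.audioTranscriptions(query: query)
}

On the wire, each granularity is encoded as a repeated timestamp_granularities[] form field, which is how the API expects array parameters in multipart requests.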
Sources/OpenAI/Public/Models/AudioTranscriptionResult.swift (62 additions, 1 deletion)
@@ -8,7 +8,68 @@
import Foundation

public struct AudioTranscriptionResult: Codable, Equatable {

public struct Word: Codable, Equatable {
/// The text content of the word.
public let word: String
/// Start time of the word in seconds.
public let start: Float
/// End time of the word in seconds.
public let end: Float
}

public struct Segment: Codable, Equatable {
/// Unique identifier of the segment.
public let id: Int
/// Seek offset of the segment.
public let seek: Int
/// Start time of the segment in seconds.
public let start: Float
/// End time of the segment in seconds.
public let end: Float
/// Text content of the segment.
public let text: String
/// Array of token IDs for the text content.
public let tokens: [Int]
/// Temperature parameter used for generating the segment.
public let temperature: Float
/// Average logprob of the segment. If the value is lower than -1, consider the logprobs failed.
public let avgLogprob: Float
/// Compression ratio of the segment. If the value is greater than 2.4, consider the compression failed.
public let compressionRatio: Float
/// Probability of no speech in the segment. If the value is higher than 1.0 and the avg_logprob is below -1, consider this segment silent.
public let noSpeechProb: Float

enum CodingKeys: String, CodingKey {
case id
case seek
case start
case end
case text
case tokens
case temperature
case avgLogprob = "avg_logprob"
case compressionRatio = "compression_ratio"
case noSpeechProb = "no_speech_prob"
}
}

/// The transcribed text.
public let text: String

/// The task performed (for transcriptions, "transcribe").
public let task: String?

/// The language of the input audio.
public let language: String?

/// The duration of the input audio.
public let duration: Float?

/// Extracted words and their corresponding timestamps.
public let words: [Word]?

/// Segments of the transcribed text and their corresponding details.
public let segments: [Segment]?
}
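As a quick check of the decoding path: Segment's hand-written CodingKeys map the API's snake_case names, so a plain JSONDecoder suffices. The payload below is made up for illustration; real responses come from the endpoint.

import Foundation

// A trimmed-down verbose_json payload with illustrative values.
let sample = """
{
  "task": "transcribe",
  "language": "english",
  "duration": 1.1,
  "text": "Hello world",
  "words": [
    { "word": "Hello", "start": 0.0, "end": 0.4 },
    { "word": "world", "start": 0.5, "end": 1.0 }
  ],
  "segments": [
    { "id": 0, "seek": 0, "start": 0.0, "end": 1.1,
      "text": "Hello world", "tokens": [1, 2, 3],
      "temperature": 0.0, "avg_logprob": -0.25,
      "compression_ratio": 1.1, "no_speech_prob": 0.01 }
  ]
}
""".data(using: .utf8)!

do {
    let result = try JSONDecoder().decode(AudioTranscriptionResult.self, from: sample)
    for word in result.words ?? [] {
        print("\(word.word): \(word.start)s to \(word.end)s")
    }
} catch {
    print("Decoding failed: \(error)")
}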