feat: support gpt-4o-audio-preview

sashabaranov · Nov 11, 2024 · c9c258b · c9c258b
1 parent 6d066bb
commit c9c258b
Show file tree

Hide file tree

Showing 7 changed files with 444 additions and 82 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,6 +7,7 @@
 
 # Test binary, built with `go test -c`
 *.test
+test.mp3
 
 # Output of the go coverage tool, specifically when used with LiteIDE
 *.out

diff --git a/chat.go b/chat.go
@@ -78,17 +78,63 @@ type ChatMessageImageURL struct {
 	Detail ImageURLDetail `json:"detail,omitempty"`
 }
 
+type AudioVoice string
+
+const (
+	AudioVoiceAlloy   AudioVoice = "alloy"
+	AudioVoiceAsh     AudioVoice = "ash"
+	AudioVoiceBallad  AudioVoice = "ballad"
+	AudioVoiceCoral   AudioVoice = "coral"
+	AudioVoiceEcho    AudioVoice = "echo"
+	AudioVoiceSage    AudioVoice = "sage"
+	AudioVoiceShimmer AudioVoice = "shimmer"
+	AudioVoiceVerse   AudioVoice = "verse"
+)
+
+type AudioFormat string
+
+const (
+	AudioFormatWAV   AudioFormat = "wav"
+	AudioFormatMP3   AudioFormat = "mp3"
+	AudioFormatFLAC  AudioFormat = "flac"
+	AudioFormatOPUS  AudioFormat = "opus"
+	AudioFormatPCM16 AudioFormat = "pcm16"
+)
+
+type ChatMessageAudio struct {
+	// Base64 encoded audio data.
+	Data string `json:"data,omitempty"`
+	// The format of the encoded audio data. Currently supports "wav" and "mp3".
+	Format AudioFormat `json:"format,omitempty"`
+}
+
+type Modality string
+
+const (
+	ModalityAudio Modality = "audio"
+	ModalityText  Modality = "text"
+)
+
+type AudioOutput struct {
+	// The voice the model uses to respond. Supported voices are alloy, ash, ballad, coral, echo, sage, shimmer, and verse.
+	Voice AudioVoice `json:"voice"`
+	// Specifies the output audio format. Must be one of wav, mp3, flac, opus, or pcm16.
+	Format AudioFormat `json:"format"`
+}
+
 type ChatMessagePartType string
 
 const (
-	ChatMessagePartTypeText     ChatMessagePartType = "text"
-	ChatMessagePartTypeImageURL ChatMessagePartType = "image_url"
+	ChatMessagePartTypeText       ChatMessagePartType = "text"
+	ChatMessagePartTypeImageURL   ChatMessagePartType = "image_url"
+	ChatMessagePartTypeInputAudio ChatMessagePartType = "input_audio"
 )
 
 type ChatMessagePart struct {
-	Type     ChatMessagePartType  `json:"type,omitempty"`
-	Text     string               `json:"text,omitempty"`
-	ImageURL *ChatMessageImageURL `json:"image_url,omitempty"`
+	Type       ChatMessagePartType  `json:"type,omitempty"`
+	Text       string               `json:"text,omitempty"`
+	ImageURL   *ChatMessageImageURL `json:"image_url,omitempty"`
+	InputAudio *ChatMessageAudio    `json:"input_audio,omitempty"`
 }
 
 type ChatCompletionMessage struct {
@@ -110,72 +156,74 @@ type ChatCompletionMessage struct {
 
 	// For Role=tool prompts this should be set to the ID given in the assistant's prior request to call a tool.
 	ToolCallID string `json:"tool_call_id,omitempty"`
+
+	// If the audio output modality is requested, this object contains data about the audio response from the model.
+	Audio *ChatCompletionAudio `json:"audio,omitempty"`
+}
+
+type chatCompletionMessageMultiContent struct {
+	Role         string               `json:"role"`
+	Content      string               `json:"-"`
+	Refusal      string               `json:"refusal,omitempty"`
+	MultiContent []ChatMessagePart    `json:"content,omitempty"`
+	Name         string               `json:"name,omitempty"`
+	FunctionCall *FunctionCall        `json:"function_call,omitempty"`
+	ToolCalls    []ToolCall           `json:"tool_calls,omitempty"`
+	ToolCallID   string               `json:"tool_call_id,omitempty"`
+	Audio        *ChatCompletionAudio `json:"audio,omitempty"`
+}
+
+type chatCompletionMessageSingleContent struct {
+	Role         string               `json:"role"`
+	Content      string               `json:"content"`
+	Refusal      string               `json:"refusal,omitempty"`
+	MultiContent []ChatMessagePart    `json:"-"`
+	Name         string               `json:"name,omitempty"`
+	FunctionCall *FunctionCall        `json:"function_call,omitempty"`
+	ToolCalls    []ToolCall           `json:"tool_calls,omitempty"`
+	ToolCallID   string               `json:"tool_call_id,omitempty"`
+	Audio        *ChatCompletionAudio `json:"audio,omitempty"`
 }
 
 func (m ChatCompletionMessage) MarshalJSON() ([]byte, error) {
 	if m.Content != "" && m.MultiContent != nil {
 		return nil, ErrContentFieldsMisused
 	}
 	if len(m.MultiContent) > 0 {
-		msg := struct {
-			Role         string            `json:"role"`
-			Content      string            `json:"-"`
-			Refusal      string            `json:"refusal,omitempty"`
-			MultiContent []ChatMessagePart `json:"content,omitempty"`
-			Name         string            `json:"name,omitempty"`
-			FunctionCall *FunctionCall     `json:"function_call,omitempty"`
-			ToolCalls    []ToolCall        `json:"tool_calls,omitempty"`
-			ToolCallID   string            `json:"tool_call_id,omitempty"`
-		}(m)
+		msg := chatCompletionMessageMultiContent(m)
 		return json.Marshal(msg)
 	}
 
-	msg := struct {
-		Role         string            `json:"role"`
-		Content      string            `json:"content"`
-		Refusal      string            `json:"refusal,omitempty"`
-		MultiContent []ChatMessagePart `json:"-"`
-		Name         string            `json:"name,omitempty"`
-		FunctionCall *FunctionCall     `json:"function_call,omitempty"`
-		ToolCalls    []ToolCall        `json:"tool_calls,omitempty"`
-		ToolCallID   string            `json:"tool_call_id,omitempty"`
-	}(m)
+	msg := chatCompletionMessageSingleContent(m)
 	return json.Marshal(msg)
 }
 
 func (m *ChatCompletionMessage) UnmarshalJSON(bs []byte) error {
-	msg := struct {
-		Role         string `json:"role"`
-		Content      string `json:"content"`
-		Refusal      string `json:"refusal,omitempty"`
-		MultiContent []ChatMessagePart
-		Name         string        `json:"name,omitempty"`
-		FunctionCall *FunctionCall `json:"function_call,omitempty"`
-		ToolCalls    []ToolCall    `json:"tool_calls,omitempty"`
-		ToolCallID   string        `json:"tool_call_id,omitempty"`
-	}{}
+	msg := chatCompletionMessageSingleContent{}
 
 	if err := json.Unmarshal(bs, &msg); err == nil {
 		*m = ChatCompletionMessage(msg)
 		return nil
 	}
-	multiMsg := struct {
-		Role         string `json:"role"`
-		Content      string
-		Refusal      string            `json:"refusal,omitempty"`
-		MultiContent []ChatMessagePart `json:"content"`
-		Name         string            `json:"name,omitempty"`
-		FunctionCall *FunctionCall     `json:"function_call,omitempty"`
-		ToolCalls    []ToolCall        `json:"tool_calls,omitempty"`
-		ToolCallID   string            `json:"tool_call_id,omitempty"`
-	}{}
+	multiMsg := chatCompletionMessageMultiContent{}
 	if err := json.Unmarshal(bs, &multiMsg); err != nil {
 		return err
 	}
 	*m = ChatCompletionMessage(multiMsg)
 	return nil
 }
 
+type ChatCompletionAudio struct {
+	// Unique identifier for this audio response.
+	ID string `json:"id"`
+	// The Unix timestamp (in seconds) for when this audio response will no longer be accessible on the server for use in multi-turn conversations.
+	ExpiresAt int64 `json:"expires_at"`
+	// Base64 encoded audio bytes generated by the model, in the format specified in the request.
+	Data string `json:"data"`
+	// Transcript of the audio generated by the model.
+	Transcript string `json:"transcript"`
+}
+
 type ToolCall struct {
 	// Index is not nil only in chat completion chunk object
 	Index    *int         `json:"index,omitempty"`
@@ -260,6 +308,11 @@ type ChatCompletionRequest struct {
 	Store bool `json:"store,omitempty"`
 	// Metadata to store with the completion.
 	Metadata map[string]string `json:"metadata,omitempty"`
+	// Output types that you would like the model to generate for this request. Most models are capable of generating text, which is the default: ["text"]
+	// The gpt-4o-audio-preview model can also be used to generate audio. To request that this model generate both text and audio responses, you can use: ["text", "audio"]
+	Modalities []Modality `json:"modalities,omitempty"`
+	// Parameters for audio output. Required when audio output is requested with modalities: ["audio"]
+	Audio *AudioOutput `json:"audio,omitempty"`
 }
 
 type StreamOptions struct {

diff --git a/chat_stream.go b/chat_stream.go
@@ -5,12 +5,19 @@ import (
 	"net/http"
 )
 
+type ChatCompletionStreamChoiceDeltaAudio struct {
+	ID         string `json:"id,omitempty"`
+	Transcript string `json:"transcript,omitempty"`
+	Data       string `json:"data,omitempty"`
+}
+
 type ChatCompletionStreamChoiceDelta struct {
-	Content      string        `json:"content,omitempty"`
-	Role         string        `json:"role,omitempty"`
-	FunctionCall *FunctionCall `json:"function_call,omitempty"`
-	ToolCalls    []ToolCall    `json:"tool_calls,omitempty"`
-	Refusal      string        `json:"refusal,omitempty"`
+	Content      string                                `json:"content,omitempty"`
+	Role         string                                `json:"role,omitempty"`
+	FunctionCall *FunctionCall                         `json:"function_call,omitempty"`
+	ToolCalls    []ToolCall                            `json:"tool_calls,omitempty"`
+	Refusal      string                                `json:"refusal,omitempty"`
+	Audio        *ChatCompletionStreamChoiceDeltaAudio `json:"audio,omitempty"`
 }
 
 type ChatCompletionStreamChoiceLogprobs struct {
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,6 +7,7 @@ @@
     # Test binary, built with `go test -c`
     *.test
+    test.mp3
     # Output of the go coverage tool, specifically when used with LiteIDE
     *.out
@@ Expand Down @@