Skip to content

Commit

Permalink
feat: support gpt-4o-audio-preview
Browse files Browse the repository at this point in the history
  • Loading branch information
WqyJh committed Nov 11, 2024
1 parent 6d066bb commit c9c258b
Show file tree
Hide file tree
Showing 7 changed files with 444 additions and 82 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

# Test binary, built with `go test -c`
*.test
test.mp3

# Output of the go coverage tool, specifically when used with LiteIDE
*.out
Expand Down
143 changes: 98 additions & 45 deletions chat.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,17 +78,63 @@ type ChatMessageImageURL struct {
Detail ImageURLDetail `json:"detail,omitempty"`
}

// AudioVoice names a voice the model can use when producing audio output.
type AudioVoice string

// Voices that can be requested for audio output.
const (
	AudioVoiceAlloy   AudioVoice = "alloy"
	AudioVoiceAsh     AudioVoice = "ash"
	AudioVoiceBallad  AudioVoice = "ballad"
	AudioVoiceCoral   AudioVoice = "coral"
	AudioVoiceEcho    AudioVoice = "echo"
	AudioVoiceSage    AudioVoice = "sage"
	AudioVoiceShimmer AudioVoice = "shimmer"
	AudioVoiceVerse   AudioVoice = "verse"
)

// AudioFormat names an encoding for audio data sent to or returned by the API.
type AudioFormat string

// Audio encodings declared by this package.
const (
	AudioFormatWAV   AudioFormat = "wav"
	AudioFormatMP3   AudioFormat = "mp3"
	AudioFormatFLAC  AudioFormat = "flac"
	AudioFormatOPUS  AudioFormat = "opus"
	AudioFormatPCM16 AudioFormat = "pcm16"
)

// ChatMessageAudio is the audio payload of an input_audio message part.
// Both fields are omitted from the JSON encoding when empty.
type ChatMessageAudio struct {
	// Data is the base64 encoded audio data.
	Data string `json:"data,omitempty"`
	// Format is the format of the encoded audio data.
	// Currently supports "wav" and "mp3".
	Format AudioFormat `json:"format,omitempty"`
}

// Modality names an output type that can be requested from the model.
type Modality string

// Output modalities declared by this package.
const (
	ModalityAudio Modality = "audio"
	ModalityText  Modality = "text"
)

// AudioOutput configures the audio the model generates for a request.
// Neither field uses omitempty, so both are always present in the JSON encoding.
type AudioOutput struct {
	// Voice is the voice the model uses to respond. Supported voices are
	// alloy, ash, ballad, coral, echo, sage, shimmer, and verse.
	Voice AudioVoice `json:"voice"`
	// Format specifies the output audio format.
	// Must be one of wav, mp3, flac, opus, or pcm16.
	Format AudioFormat `json:"format"`
}

// ChatMessagePartType is the type of the Type field of ChatMessagePart,
// which is encoded under the JSON key "type".
type ChatMessagePartType string

// Values of ChatMessagePartType.
// NOTE(review): the text and image_url constants are listed twice below; this looks like
// a rendered diff showing the old and re-aligned lines together — confirm against the file.
const (
ChatMessagePartTypeText ChatMessagePartType = "text"
ChatMessagePartTypeImageURL ChatMessagePartType = "image_url"
ChatMessagePartTypeText ChatMessagePartType = "text"
ChatMessagePartTypeImageURL ChatMessagePartType = "image_url"
ChatMessagePartTypeInputAudio ChatMessagePartType = "input_audio"
)

// ChatMessagePart is one element of a message's multi-part content array.
// Every field is tagged omitempty, so zero values are left out when marshalling.
// NOTE(review): presumably only the payload field matching Type is populated — confirm with callers.
// NOTE(review): Type, Text and ImageURL appear twice below; this looks like a rendered diff
// showing the old and re-aligned lines together — confirm against the file.
type ChatMessagePart struct {
Type ChatMessagePartType `json:"type,omitempty"`
Text string `json:"text,omitempty"`
ImageURL *ChatMessageImageURL `json:"image_url,omitempty"`
Type ChatMessagePartType `json:"type,omitempty"`
Text string `json:"text,omitempty"`
ImageURL *ChatMessageImageURL `json:"image_url,omitempty"`
// InputAudio is the audio payload, encoded under the JSON key "input_audio".
InputAudio *ChatMessageAudio `json:"input_audio,omitempty"`
}

type ChatCompletionMessage struct {
Expand All @@ -110,72 +156,74 @@ type ChatCompletionMessage struct {

// For Role=tool prompts this should be set to the ID given in the assistant's prior request to call a tool.
ToolCallID string `json:"tool_call_id,omitempty"`

// If the audio output modality is requested, this object contains data about the audio response from the model.
Audio *ChatCompletionAudio `json:"audio,omitempty"`
}

// chatCompletionMessageMultiContent is the JSON shape of a chat message whose
// "content" key holds an array of parts. The plain-string Content field is
// excluded from JSON. Fields must stay in the same order and of the same types
// as ChatCompletionMessage so values can be converted between the two types.
type chatCompletionMessageMultiContent struct {
	Role         string               `json:"role"`
	Content      string               `json:"-"`
	Refusal      string               `json:"refusal,omitempty"`
	MultiContent []ChatMessagePart    `json:"content,omitempty"`
	Name         string               `json:"name,omitempty"`
	FunctionCall *FunctionCall        `json:"function_call,omitempty"`
	ToolCalls    []ToolCall           `json:"tool_calls,omitempty"`
	ToolCallID   string               `json:"tool_call_id,omitempty"`
	Audio        *ChatCompletionAudio `json:"audio,omitempty"`
}

// chatCompletionMessageSingleContent is the JSON shape of a chat message whose
// "content" key holds a plain string. The MultiContent field is excluded from
// JSON. Content has no omitempty, so an empty string is still written. Fields
// must stay in the same order and of the same types as ChatCompletionMessage
// so values can be converted between the two types.
type chatCompletionMessageSingleContent struct {
	Role         string               `json:"role"`
	Content      string               `json:"content"`
	Refusal      string               `json:"refusal,omitempty"`
	MultiContent []ChatMessagePart    `json:"-"`
	Name         string               `json:"name,omitempty"`
	FunctionCall *FunctionCall        `json:"function_call,omitempty"`
	ToolCalls    []ToolCall           `json:"tool_calls,omitempty"`
	ToolCallID   string               `json:"tool_call_id,omitempty"`
	Audio        *ChatCompletionAudio `json:"audio,omitempty"`
}

// MarshalJSON encodes the message so that exactly one of Content or MultiContent
// is written under the JSON key "content".
//
// It returns ErrContentFieldsMisused when Content is non-empty and MultiContent
// is non-nil. When MultiContent has at least one part, the parts are written as
// "content" and Content is left out. Otherwise Content is written as a string,
// even when it is empty.
//
// NOTE(review): each branch below declares msg twice (an anonymous struct conversion
// followed by a named-type conversion); this looks like a rendered diff showing the
// removed and added forms together — confirm against the file.
func (m ChatCompletionMessage) MarshalJSON() ([]byte, error) {
// The check is != nil rather than len > 0, so an empty but non-nil MultiContent
// combined with a non-empty Content is rejected as well.
if m.Content != "" && m.MultiContent != nil {
return nil, ErrContentFieldsMisused
}
if len(m.MultiContent) > 0 {
msg := struct {
Role string `json:"role"`
Content string `json:"-"`
Refusal string `json:"refusal,omitempty"`
MultiContent []ChatMessagePart `json:"content,omitempty"`
Name string `json:"name,omitempty"`
FunctionCall *FunctionCall `json:"function_call,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"`
}(m)
// Converting to a type with different tags changes which field is encoded as "content".
msg := chatCompletionMessageMultiContent(m)
return json.Marshal(msg)
}

msg := struct {
Role string `json:"role"`
Content string `json:"content"`
Refusal string `json:"refusal,omitempty"`
MultiContent []ChatMessagePart `json:"-"`
Name string `json:"name,omitempty"`
FunctionCall *FunctionCall `json:"function_call,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"`
}(m)
msg := chatCompletionMessageSingleContent(m)
return json.Marshal(msg)
}

// UnmarshalJSON decodes a message whose "content" key is either a JSON string
// or an array of parts.
//
// It first decodes with "content" mapped to the string Content field. If that
// returns any error, it decodes again with "content" mapped to MultiContent and
// returns the second attempt's error if that fails too. On success *m is
// overwritten with the decoded value.
//
// NOTE(review): msg and multiMsg are each declared twice below (an anonymous struct
// followed by a named type); this looks like a rendered diff showing the removed and
// added forms together — confirm against the file.
func (m *ChatCompletionMessage) UnmarshalJSON(bs []byte) error {
msg := struct {
Role string `json:"role"`
Content string `json:"content"`
Refusal string `json:"refusal,omitempty"`
MultiContent []ChatMessagePart
Name string `json:"name,omitempty"`
FunctionCall *FunctionCall `json:"function_call,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"`
}{}
msg := chatCompletionMessageSingleContent{}

// Any failure here (not only a type mismatch on "content") falls through to the
// multi-content attempt; the first error is discarded.
if err := json.Unmarshal(bs, &msg); err == nil {
*m = ChatCompletionMessage(msg)
return nil
}
multiMsg := struct {
Role string `json:"role"`
Content string
Refusal string `json:"refusal,omitempty"`
MultiContent []ChatMessagePart `json:"content"`
Name string `json:"name,omitempty"`
FunctionCall *FunctionCall `json:"function_call,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
ToolCallID string `json:"tool_call_id,omitempty"`
}{}
multiMsg := chatCompletionMessageMultiContent{}
if err := json.Unmarshal(bs, &multiMsg); err != nil {
return err
}
*m = ChatCompletionMessage(multiMsg)
return nil
}

// ChatCompletionAudio holds data about the audio response from the model.
// No field uses omitempty, so all four keys are always present in the JSON encoding.
type ChatCompletionAudio struct {
	// Unique identifier for this audio response.
	ID string `json:"id"`
	// The Unix timestamp (in seconds) for when this audio response will no longer be
	// accessible on the server for use in multi-turn conversations.
	ExpiresAt int64 `json:"expires_at"`
	// Base64 encoded audio bytes generated by the model, in the format specified in the request.
	Data string `json:"data"`
	// Transcript of the audio generated by the model.
	Transcript string `json:"transcript"`
}

type ToolCall struct {
// Index is not nil only in chat completion chunk object
Index *int `json:"index,omitempty"`
Expand Down Expand Up @@ -260,6 +308,11 @@ type ChatCompletionRequest struct {
Store bool `json:"store,omitempty"`
// Metadata to store with the completion.
Metadata map[string]string `json:"metadata,omitempty"`
// Output types that you would like the model to generate for this request.
// Most models are capable of generating text, which is the default: ["text"]

Check failure on line 311 in chat.go

View workflow job for this annotation

GitHub Actions / Sanity check

the line is 151 characters long, which exceeds the maximum of 120 characters. (lll)
// The gpt-4o-audio-preview model can also be used to generate audio. To request that
// this model generate both text and audio responses, you can use: ["text", "audio"]

Check failure on line 312 in chat.go

View workflow job for this annotation

GitHub Actions / Sanity check

the line is 168 characters long, which exceeds the maximum of 120 characters. (lll)
Modalities []Modality `json:"modalities,omitempty"`
// Parameters for audio output. Required when audio output is requested with modalities: ["audio"]
Audio *AudioOutput `json:"audio,omitempty"`
}

type StreamOptions struct {
Expand Down
17 changes: 12 additions & 5 deletions chat_stream.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,19 @@ import (
"net/http"
)

// ChatCompletionStreamChoiceDeltaAudio is the value of the Audio field of
// ChatCompletionStreamChoiceDelta. Every field is tagged omitempty, so a zero
// value marshals to an empty JSON object.
type ChatCompletionStreamChoiceDeltaAudio struct {
	// ID is encoded under the JSON key "id".
	ID string `json:"id,omitempty"`
	// Transcript is encoded under the JSON key "transcript".
	Transcript string `json:"transcript,omitempty"`
	// Data is encoded under the JSON key "data".
	// NOTE(review): presumably base64 encoded audio bytes — confirm against the API reference.
	Data string `json:"data,omitempty"`
}

// ChatCompletionStreamChoiceDelta lists the fields carried by a stream choice delta.
// Every field is tagged omitempty, so zero values are left out when marshalling.
// NOTE(review): Content through Refusal appear twice below; this looks like a rendered
// diff showing the old and re-aligned lines together — confirm against the file.
type ChatCompletionStreamChoiceDelta struct {
Content string `json:"content,omitempty"`
Role string `json:"role,omitempty"`
FunctionCall *FunctionCall `json:"function_call,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
Refusal string `json:"refusal,omitempty"`
Content string `json:"content,omitempty"`
Role string `json:"role,omitempty"`
FunctionCall *FunctionCall `json:"function_call,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
Refusal string `json:"refusal,omitempty"`
// Audio is encoded under the JSON key "audio" and omitted when nil.
Audio *ChatCompletionStreamChoiceDeltaAudio `json:"audio,omitempty"`
}

type ChatCompletionStreamChoiceLogprobs struct {
Expand Down
Loading

0 comments on commit c9c258b

Please sign in to comment.