diff --git a/packages/ai-jsx/package.json b/packages/ai-jsx/package.json
index 55e3e6a3..4f483062 100644
--- a/packages/ai-jsx/package.json
+++ b/packages/ai-jsx/package.json
@@ -4,7 +4,7 @@
   "repository": "fixie-ai/ai-jsx",
   "bugs": "https://github.com/fixie-ai/ai-jsx/issues",
   "homepage": "https://ai-jsx.com",
-  "version": "0.32.0",
+  "version": "0.33.0",
   "volta": {
     "extends": "../../package.json"
   },
diff --git a/packages/ai-jsx/src/core/conversation.tsx b/packages/ai-jsx/src/core/conversation.tsx
index efbe8fe1..1975c575 100644
--- a/packages/ai-jsx/src/core/conversation.tsx
+++ b/packages/ai-jsx/src/core/conversation.tsx
@@ -285,13 +285,13 @@ export async function renderToConversation(
   render: AI.ComponentContext['render'],
   logger?: AI.ComponentContext['logger'],
   logType?: 'prompt' | 'completion',
-  cost?: (message: ConversationMessage, render: AI.ComponentContext['render']) => Promise<number>,
+  cost?: (message: ConversationMessage) => Promise<number>,
   budget?: number
 ) {
   const cachedCosts = new WeakMap<AI.Element<any>, Promise<number>>();
   function cachedCost(message: ConversationMessage): Promise<number> {
     if (!cachedCosts.has(message.element)) {
-      cachedCosts.set(message.element, cost!(message, render));
+      cachedCosts.set(message.element, cost!(message));
     }

     return cachedCosts.get(message.element)!;
diff --git a/packages/ai-jsx/src/core/image-gen.tsx b/packages/ai-jsx/src/core/image-gen.tsx
index a32ca919..b220e2d4 100644
--- a/packages/ai-jsx/src/core/image-gen.tsx
+++ b/packages/ai-jsx/src/core/image-gen.tsx
@@ -89,15 +89,6 @@ export function ImageGen({ children, ...props }: ImageGenPropsWithChildren, { ge
   );
 }

-export interface GeneratedImageProps {
-  /** The image URL. */
-  url: string;
-  /** The prompt used for generating the image. Currently only used for debugging. */
-  prompt: string;
-  /** The model used for generating the image. Currently only used for debugging. */
-  modelName: string;
-}
-
 /**
  * This component represents an image via a single `url` prop.
  * It is a wrapper for the output of {@link ImageGen} to allow for first-class support of images.
 *
@@ -106,6 +97,17 @@ export interface GeneratedImageProps {
  * - In terminal-based environments, this component will be rendered as a URL.
  * - In browser-based environments, this component will be rendered as an `img` tag.
  */
-export function Image(props: GeneratedImageProps) {
+export function Image(props: {
+  /** The image URL. */
+  url: string;
+  /** The prompt used for generating the image. Currently only used for debugging. */
+  prompt?: string;
+  /** The model used for generating the image. Currently only used for debugging. */
+  modelName?: string;
+  /** The level of detail required. */
+  detail?: string;
+  /** The number of input tokens required. */
+  inputTokens?: number;
+}) {
   return props.url;
 }
diff --git a/packages/ai-jsx/src/lib/openai.tsx b/packages/ai-jsx/src/lib/openai.tsx
index 5c97f2f6..7ecf3664 100644
--- a/packages/ai-jsx/src/lib/openai.tsx
+++ b/packages/ai-jsx/src/lib/openai.tsx
@@ -40,11 +40,20 @@ export type ValidChatModel =
   | 'gpt-4-0125-preview'
   | 'gpt-4-0613'
   | 'gpt-4-1106-preview'
+  | 'gpt-4-1106-vision-preview'
   | 'gpt-4-32k'
   | 'gpt-4-32k-0613'
   | 'gpt-4-turbo'
   | 'gpt-4-turbo-2024-04-09'
-  | 'gpt-4-turbo-preview';
+  | 'gpt-4-turbo-preview'
+  | 'gpt-4-vision-preview';
+
+const visionModels: Partial<Record<ValidChatModel, boolean>> = {
+  'gpt-4-1106-vision-preview': true,
+  'gpt-4-turbo-2024-04-09': true,
+  'gpt-4-turbo': true,
+  'gpt-4-vision-preview': true,
+};

 /**
  * An OpenAI client that talks to the Azure OpenAI service.
@@ -230,6 +239,8 @@ function tokenLimitForChatModel(
     case 'gpt-4-turbo-preview':
     case 'gpt-4-turbo-2024-04-09':
     case 'gpt-4-turbo':
+    case 'gpt-4-vision-preview':
+    case 'gpt-4-1106-vision-preview':
      return 128_000 - functionEstimate - TOKENS_CONSUMED_BY_REPLY_PREFIX;
     case 'gpt-3.5-turbo-0301':
     case 'gpt-3.5-turbo-0613':
@@ -248,19 +259,70 @@
   }
 }

-export async function tokenCountForConversationMessage(
+const imageTokenCost = Symbol('imageTokenCost');
+interface ImagePartWithTokenCost extends OpenAIClient.ChatCompletionContentPartImage {
+  [imageTokenCost]: () => number;
+}
+type PromptPart = OpenAIClient.ChatCompletionContentPartText | ImagePartWithTokenCost;
+
+async function renderWithImages(render: AI.RenderContext['render'], element: Node): Promise<PromptPart[]> {
+  const textAndImages = await render(element, { stop: (node) => node.tag === Image });
+  const content: PromptPart[] = [];
+
+  textAndImages.forEach((node: string | AI.Element<AI.PropsOfComponent<typeof Image>>) => {
+    if (typeof node === 'string') {
+      const lastContentPart = content.at(-1);
+      if (lastContentPart?.type === 'text') {
+        // Merge adjacent text nodes.
+        lastContentPart.text += node;
+      } else {
+        content.push({ type: 'text', text: node });
+      }
+    } else {
+      content.push({
+        type: 'image_url',
+        image_url: { url: node.props.url, detail: node.props.detail as any },
+        [imageTokenCost]() {
+          if (node.props.inputTokens) {
+            return node.props.inputTokens;
+          }
+
+          // https://platform.openai.com/docs/guides/vision/calculating-costs
+          if (node.props.detail === 'high') {
+            // Assume 6 tiles.
+            return 6 * 170 + 85;
+          }
+
+          // Otherwise assume low detail.
+          return 85;
+        },
+      });
+    }
+  });
+
+  return content;
+}
+
+async function tokenCountForConversationMessage(
   message: ConversationMessage,
-  render: AI.RenderContext['render']
+  render: AI.RenderContext['render'],
+  includeImages: boolean
 ): Promise<number> {
   const TOKENS_PER_MESSAGE = 3;
   const TOKENS_PER_NAME = 1;
   switch (message.type) {
-    case 'user':
+    case 'user': {
+      const textAndImages: PromptPart[] = includeImages
+        ? await renderWithImages(render, message.element)
+        : [{ type: 'text', text: await render(message.element) }];
       return (
         TOKENS_PER_MESSAGE +
-        tokenizer.encode(await render(message.element)).length +
+        textAndImages
+          .map((part) => (part.type === 'text' ? tokenizer.encode(part.text).length : part[imageTokenCost]()))
+          .reduce((a, b) => a + b, 0) +
         (message.element.props.name ? tokenizer.encode(message.element.props.name).length + TOKENS_PER_NAME : 0)
       );
+    }
     case 'assistant':
     case 'system':
       return TOKENS_PER_MESSAGE + tokenizer.encode(await render(message.element)).length;
@@ -368,6 +430,7 @@ export async function* OpenAIChatModel(
   props: ModelPropsWithChildren & {
     model: ValidChatModel;
     logitBias?: Record<string, number>;
+    includeImages?: boolean;
   },
   { render, getContext, logger, memo }: AI.ComponentContext
 ): AI.RenderableStream {
@@ -383,13 +446,17 @@
   const modelTokenLimit = tokenLimitForChatModel(props.model, props.functionDefinitions);
   const promptTokenLimit = props.maxInputTokens ?? modelTokenLimit - (props.reservedTokens ?? props.maxTokens ?? 0);
+  const includeImages = props.includeImages ?? props.model in visionModels;
+
+  const tokenCostForMessage = (message: ConversationMessage) =>
+    tokenCountForConversationMessage(message, render, includeImages);

   const conversationMessages = await renderToConversation(
     props.children,
     render,
     logger,
     'prompt',
-    tokenCountForConversationMessage,
+    tokenCostForMessage,
     promptTokenLimit
   );

@@ -406,11 +473,30 @@
         role: 'system',
         content: await render(message.element),
       };
-    case 'user':
+    case 'user': {
+      if (includeImages) {
+        const content = await renderWithImages(render, message.element);
+        // Prefer to pass as a single string if possible.
+        if (content.some((part) => part.type !== 'text')) {
+          return {
+            role: 'user',
+            content,
+          };
+        }
+
+        return {
+          role: 'user',
+          content: (content as OpenAIClient.ChatCompletionContentPartText[])
+            .map((part: OpenAIClient.ChatCompletionContentPartText) => part.text)
+            .join(''),
+        };
+      }
+
       return {
         role: 'user',
         content: await render(message.element),
       };
+    }
     case 'assistant':
       return {
         role: 'assistant',
@@ -656,7 +742,7 @@
     }

     // Render the completion conversation to log it.
-    await renderToConversation(outputMessages, render, logger, 'completion', tokenCountForConversationMessage);
+    await renderToConversation(outputMessages, render, logger, 'completion', tokenCostForMessage);

     return AI.AppendOnlyStream;
   }
diff --git a/packages/docs/docs/changelog.md b/packages/docs/docs/changelog.md
index b522945c..d94720a3 100644
--- a/packages/docs/docs/changelog.md
+++ b/packages/docs/docs/changelog.md
@@ -1,6 +1,10 @@
 # Changelog

-## 0.32.0
+## 0.33.0
+
+- Add support for passing `<Image>` to OpenAI models.
+
+## [0.32.0](https://github.com/fixie-ai/ai-jsx/tree/e7b3e2e444659a49e04693337c2af023c506fbe6)

 - Improve rendering performance:
   - Change `<ShrinkConversation>` to cache token costs when remeasuring the same elements
diff --git a/packages/examples/package.json b/packages/examples/package.json
index 285af4a5..db43c797 100644
--- a/packages/examples/package.json
+++ b/packages/examples/package.json
@@ -63,6 +63,7 @@
     "demo:image-generation": "yarn build && node dist/src/image-generation.js",
     "demo:shrink": "yarn build && node dist/src/conversation-shrinking.js",
     "demo:fixie-corpus": "yarn build && node dist/src/fixie-corpus.js",
+    "demo:vision": "yarn build && node dist/src/vision.js",
     "view-logs": "cat ai-jsx.log | pino-pretty",
     "lint": "eslint . --max-warnings 0",
     "lint:fix": "eslint . --fix",
diff --git a/packages/examples/src/vision.tsx b/packages/examples/src/vision.tsx
new file mode 100644
index 00000000..5a684234
--- /dev/null
+++ b/packages/examples/src/vision.tsx
@@ -0,0 +1,23 @@
+import { ChatCompletion, UserMessage } from 'ai-jsx/core/completion';
+import { Image } from 'ai-jsx/core/image-gen';
+import { showInspector } from 'ai-jsx/core/inspector';
+import { OpenAI } from 'ai-jsx/lib/openai';
+
+function App() {
+  return (
+    <OpenAI chatModel="gpt-4-vision-preview">
+      <ChatCompletion>
+        <UserMessage>
+          What do the following images have in common?
+          <Image url="..." />
+          <Image url="..." />
+        </UserMessage>
+      </ChatCompletion>
+    </OpenAI>
+  );
+}
+
+showInspector(<App />);
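A note for reviewers: the per-image token estimate that `renderWithImages` attaches via the `imageTokenCost` symbol can be sketched standalone as below. The constants mirror the diff (an 85-token base, 170 tokens per tile, and an assumed 6 tiles for `detail: 'high'`); `estimateImageTokens` is a hypothetical helper name used here for illustration, not part of the ai-jsx API.

```ts
// Sketch of the image token-cost heuristic used in renderWithImages above.
// Numbers follow https://platform.openai.com/docs/guides/vision/calculating-costs
function estimateImageTokens(detail?: string, inputTokens?: number): number {
  if (inputTokens) {
    return inputTokens; // An explicit <Image inputTokens={...}> count always wins.
  }
  if (detail === 'high') {
    return 6 * 170 + 85; // Assume 6 tiles at 170 tokens each, plus the 85-token base.
  }
  return 85; // Low (or unspecified) detail is a flat 85 tokens.
}
```

Under this estimate a high-detail image consumes 1,105 tokens of the prompt budget while the default costs 85, so callers that know the true cost for their image dimensions can pass `inputTokens` to override the guess.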