Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENG-857] Filter large image messages #190

Merged
merged 6 commits into from
Jun 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 31 additions & 1 deletion log10/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,36 @@ def _process_chunk(self, chunk):
logger.error(f"Failed to insert in log10: {self.partial_log_row} with error {res.text}. Skipping")


def _is_oversized_inline_image(fragment, max_url_length):
    """Return True if ``fragment`` is an inline ``data:image`` URL longer than ``max_url_length``."""
    if not isinstance(fragment, dict) or fragment.get("type") != "image_url":
        return False
    image_url = fragment.get("image_url")
    url = image_url.get("url") if isinstance(image_url, dict) else None
    # Only inline (base64 data URL) images carry their payload in the URL
    # itself; remote links are short and are always kept.
    return isinstance(url, str) and url.startswith("data:image") and len(url) > max_url_length


# Filter large images from messages, and replace with a text message saying "Image too large to capture"
def filter_large_images(messages, max_url_length=4e6):
    """Replace oversized inline images in chat messages with a text placeholder.

    For every message whose ``content`` is a list of fragments, each
    ``image_url`` fragment whose URL starts with ``data:image`` and is longer
    than ``max_url_length`` characters is replaced by
    ``{"type": "text", "text": "Image too large to capture"}``.
    Everything else is passed through untouched: plain-string content, text
    fragments, remote image links, small inline images, and malformed
    fragments (non-dict fragments, or a missing/``None`` ``image_url``/``url``).

    Args:
        messages: Iterable of message dicts. Messages are updated in place.
        max_url_length: Maximum allowed length, in characters, of a
            ``data:image`` URL. Defaults to 4e6 (about 4 MB of URL text).

    Returns:
        The same ``messages`` object that was passed in.
    """
    for message in messages:
        # Content may be a list of text/image fragments. Anything else (a
        # plain string, None, a non-dict message) has nothing to filter.
        content = message.get("content") if isinstance(message, dict) else None
        if not isinstance(content, list):
            continue
        message["content"] = [
            {"type": "text", "text": "Image too large to capture"}
            if _is_oversized_inline_image(fragment, max_url_length)
            else fragment
            for fragment in content
        ]

    return messages


def flatten_messages(messages):
flat_messages = []
for message in messages:
Expand Down Expand Up @@ -524,7 +554,7 @@ def _init_log_row(func, *args, **kwargs):
# We may have to flatten messages from their ChatCompletionMessage with nested ChatCompletionMessageToolCall to json serializable format
# Rewrite in-place
if "messages" in kwargs_copy:
kwargs_copy["messages"] = flatten_messages(kwargs_copy["messages"])
kwargs_copy["messages"] = filter_large_images(flatten_messages(kwargs_copy["messages"]))

# kind and request are set based on the module and qualname
# request is based on openai schema
Expand Down
76 changes: 76 additions & 0 deletions tests/test_large_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import copy

from log10.load import filter_large_images


def test_empty_messages():
    """An empty message list is returned as an empty list."""
    filtered = filter_large_images([])
    assert filtered == []


# Test for regular messages without fragments i.e. content isn't a list.
def test_non_fragment_messages():
    """Messages whose content is a plain string come back unchanged."""
    expected = [
        {"content": "This is a message.", "role": "system"},
        {"content": "This is another message.", "role": "user"},
    ]
    # Filter a deep copy so `expected` stays pristine for the comparison.
    actual = filter_large_images(copy.deepcopy(expected))
    assert actual == expected


# Test for a message with a fragment that is not an image.
def test_non_image_fragment():
    """Fragment-list content that holds only text fragments is left as-is."""
    system_message = {
        "content": [{"type": "text", "text": "This is a message."}],
        "role": "system",
    }
    user_message = {
        "content": [{"type": "text", "text": "This is another message."}],
        "role": "user",
    }
    expected = [system_message, user_message]
    # Filter a deep copy so `expected` stays pristine for the comparison.
    actual = filter_large_images(copy.deepcopy(expected))
    assert actual == expected


def test_small_image_fragment():
    """Remote image links and small inline data-URL images are both kept unchanged."""
    messages = [
        {"content": [{"type": "text", "text": "This is a message."}], "role": "system"},
        {
            "content": [
                # Remote link: never filtered, whatever the image size.
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/image.png"},
                },
                # Inline image well under the size limit: exercises the
                # "data:image but small enough" branch.
                {
                    "type": "image_url",
                    "image_url": {"url": "data:image/png;base64,iVBORw0KGgo="},
                },
            ],
            "role": "system",
        },
        {"content": [{"type": "text", "text": "This is a message."}], "role": "system"},
    ]
    assert filter_large_images(copy.deepcopy(messages)) == messages


def test_large_image_fragment():
    """An inline image whose data URL exceeds the limit becomes a text placeholder."""
    payload = "a" * int(4e6)
    oversized_image = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{payload}"},
    }
    placeholder = {
        "type": "text",
        "text": "Image too large to capture",
    }

    def text_message():
        # A fresh dict per call so no message object is shared between lists.
        return {"content": [{"type": "text", "text": "This is a message."}], "role": "system"}

    given = [
        text_message(),
        {"content": [oversized_image], "role": "system"},
        text_message(),
    ]
    expected = [
        text_message(),
        {"content": [placeholder], "role": "system"},
        text_message(),
    ]
    assert filter_large_images(given) == expected
1 change: 1 addition & 0 deletions tests/test_magentic.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,3 +208,4 @@ def _llm() -> str: ...

output = _llm()
assert isinstance(output, str)
_LogAssertion(completion_id=session.last_completion_id(), message_content=output).assert_chat_response()
Loading