-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Add configurable audio encoding for OpenAI models (Data URI support) #3596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,6 +41,13 @@ class OpenAIModelProfile(ModelProfile): | |
| openai_chat_supports_web_search: bool = False | ||
| """Whether the model supports web search in Chat Completions API.""" | ||
|
|
||
| openai_audio_input_encoding: Literal['base64', 'uri'] = 'base64' | ||
|
||
| """The encoding to use for audio input. | ||
|
|
||
| - `'base64'`: Raw base64 encoded string. (Default, used by OpenAI) | ||
| - `'uri'`: Data URI (e.g. `data:audio/wav;base64,...`). (Used by Qwen Omni) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should still make it so that this is used automatically for Qwen Omni. If that's only a requirement of Qwen's own ChatCompletions-compatible API, we may want a new provider class that can define its own |
||
| """ | ||
|
|
||
| openai_supports_encrypted_reasoning_content: bool = False | ||
| """Whether the model supports including encrypted reasoning content in the response.""" | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,135 @@ | ||
| from __future__ import annotations as _annotations | ||
|
|
||
| import base64 | ||
| from unittest.mock import patch | ||
|
|
||
| import pytest | ||
|
|
||
| from pydantic_ai import Agent, AudioUrl, BinaryContent | ||
| from pydantic_ai.models.openai import OpenAIChatModel | ||
| from pydantic_ai.profiles.openai import OpenAIModelProfile | ||
| from pydantic_ai.providers.openai import OpenAIProvider | ||
|
|
||
| from ..conftest import try_import | ||
| from .mock_openai import MockOpenAI, completion_message, get_mock_chat_completion_kwargs | ||
|
|
||
| with try_import() as imports_successful: | ||
| from openai.types.chat.chat_completion_message import ChatCompletionMessage | ||
|
|
||
| pytestmark = [ | ||
| pytest.mark.skipif(not imports_successful(), reason='openai not installed'), | ||
| pytest.mark.anyio, | ||
| ] | ||
|
|
||
|
|
||
def test_openai_chat_audio_default_base64(allow_model_requests: None):
    """Binary audio is sent as a raw base64 string by default (OpenAI's own format)."""
    completion = completion_message(ChatCompletionMessage(content='success', role='assistant'))
    client = MockOpenAI.create_mock(completion)
    agent = Agent(OpenAIChatModel('gpt-4o-audio-preview', provider=OpenAIProvider(openai_client=client)))

    # Attach audio as BinaryContent and run the agent against the mocked client.
    raw_bytes = b'fake_audio_data'
    agent.run_sync(['Process this audio', BinaryContent(raw_bytes, media_type='audio/wav')])

    # Inspect the request that was actually sent to the mock client.
    sent_messages = get_mock_chat_completion_kwargs(client)[0]['messages']
    content_parts = sent_messages[0]['content']
    audio_part = next(p for p in content_parts if p['type'] == 'input_audio')

    # Default encoding: plain base64, no data-URI wrapper.
    assert audio_part['input_audio']['data'] == base64.b64encode(raw_bytes).decode('utf-8')
    assert audio_part['input_audio']['format'] == 'wav'
||
def test_openai_chat_audio_uri_encoding(allow_model_requests: None):
    """With `openai_audio_input_encoding='uri'`, binary audio is wrapped in a data URI."""
    completion = completion_message(ChatCompletionMessage(content='success', role='assistant'))
    client = MockOpenAI.create_mock(completion)

    # Opt in to data-URI encoding via the model profile (as used by Qwen Omni).
    model = OpenAIChatModel(
        'gpt-4o-audio-preview',
        provider=OpenAIProvider(openai_client=client),
        profile=OpenAIModelProfile(openai_audio_input_encoding='uri'),
    )
    agent = Agent(model)

    raw_bytes = b'fake_audio_data'
    agent.run_sync(['Process this audio', BinaryContent(raw_bytes, media_type='audio/wav')])

    # Inspect the request that was actually sent to the mock client.
    sent_messages = get_mock_chat_completion_kwargs(client)[0]['messages']
    content_parts = sent_messages[0]['content']
    audio_part = next(p for p in content_parts if p['type'] == 'input_audio')

    # URI encoding: `data:<mime>;base64,<payload>` instead of the bare payload.
    encoded = base64.b64encode(raw_bytes).decode('utf-8')
    assert audio_part['input_audio']['data'] == f'data:audio/wav;base64,{encoded}'
    assert audio_part['input_audio']['format'] == 'wav'
||
async def test_openai_chat_audio_url_default_base64(allow_model_requests: None):
    """Downloaded `AudioUrl` content is forwarded as raw base64 under the default encoding."""
    completion = completion_message(ChatCompletionMessage(content='success', role='assistant'))
    client = MockOpenAI.create_mock(completion)
    agent = Agent(OpenAIChatModel('gpt-4o-audio-preview', provider=OpenAIProvider(openai_client=client)))

    # download_item is mocked so no network access happens; it returns base64 data.
    downloaded_b64 = base64.b64encode(b'fake_downloaded_audio').decode('utf-8')
    with patch('pydantic_ai.models.openai.download_item') as mock_download:
        mock_download.return_value = {'data': downloaded_b64, 'data_type': 'mp3'}
        await agent.run(['Process this audio url', AudioUrl('https://example.com/audio.mp3')])

    # Inspect the request that was actually sent to the mock client.
    sent_messages = get_mock_chat_completion_kwargs(client)[0]['messages']
    content_parts = sent_messages[0]['content']
    audio_part = next(p for p in content_parts if p['type'] == 'input_audio')

    # Default encoding passes through exactly what download_item returned.
    assert audio_part['input_audio']['data'] == downloaded_b64
    assert audio_part['input_audio']['format'] == 'mp3'
||
async def test_openai_chat_audio_url_uri_encoding(allow_model_requests: None):
    """Downloaded `AudioUrl` content becomes a data URI when the profile selects `'uri'`."""
    completion = completion_message(ChatCompletionMessage(content='success', role='assistant'))
    client = MockOpenAI.create_mock(completion)

    # Opt in to data-URI encoding via the model profile (as used by Qwen Omni).
    model = OpenAIChatModel(
        'gpt-4o-audio-preview',
        provider=OpenAIProvider(openai_client=client),
        profile=OpenAIModelProfile(openai_audio_input_encoding='uri'),
    )
    agent = Agent(model)

    # download_item is mocked so no network access happens; it returns base64 data.
    downloaded_b64 = base64.b64encode(b'fake_downloaded_audio').decode('utf-8')
    with patch('pydantic_ai.models.openai.download_item') as mock_download:
        mock_download.return_value = {'data': downloaded_b64, 'data_type': 'mp3'}
        await agent.run(['Process this audio url', AudioUrl('https://example.com/audio.mp3')])

    # Inspect the request that was actually sent to the mock client.
    sent_messages = get_mock_chat_completion_kwargs(client)[0]['messages']
    content_parts = sent_messages[0]['content']
    audio_part = next(p for p in content_parts if p['type'] == 'input_audio')

    # URI encoding uses the proper MIME type for mp3 (audio/mpeg), not the bare format name.
    assert audio_part['input_audio']['data'] == f'data:audio/mpeg;base64,{downloaded_b64}'
    assert audio_part['input_audio']['format'] == 'mp3'
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can use `item.media_type`, right?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, good point — I can use
`item.media_type` here instead of maintaining my own `format_to_mime` mapping. I'll update the `AudioUrl` handling in `_map_user_prompt` to construct the data URI using `item.media_type`, with a simple fallback if it's missing.