Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions pydantic_ai_slim/pydantic_ai/models/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -939,7 +939,11 @@ async def _map_user_prompt(self, part: UserPromptPart) -> chat.ChatCompletionUse
content.append(ChatCompletionContentPartImageParam(image_url=image_url, type='image_url'))
elif item.is_audio:
assert item.format in ('wav', 'mp3')
audio = InputAudio(data=base64.b64encode(item.data).decode('utf-8'), format=item.format)
profile = OpenAIModelProfile.from_profile(self.profile)
if profile.openai_audio_input_encoding == 'uri':
audio = InputAudio(data=item.data_uri, format=item.format)
else:
audio = InputAudio(data=base64.b64encode(item.data).decode('utf-8'), format=item.format)
content.append(ChatCompletionContentPartInputAudioParam(input_audio=audio, type='input_audio'))
elif item.is_document:
content.append(
Expand All @@ -959,7 +963,16 @@ async def _map_user_prompt(self, part: UserPromptPart) -> chat.ChatCompletionUse
'wav',
'mp3',
), f'Unsupported audio format: {downloaded_item["data_type"]}'
audio = InputAudio(data=downloaded_item['data'], format=downloaded_item['data_type'])
profile = OpenAIModelProfile.from_profile(self.profile)
if profile.openai_audio_input_encoding == 'uri':
format_to_mime = {'wav': 'audio/wav', 'mp3': 'audio/mpeg'}
mime_type = format_to_mime.get(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can use item.media_type right?

Copy link
Contributor Author

@Pavanmanikanta98 Pavanmanikanta98 Dec 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, good point — I can use item.media_type here instead of maintaining my own format_to_mime mapping. I'll update the AudioUrl handling in _map_user_prompt to construct the data URI using item.media_type, with a simple fallback if it's missing.

downloaded_item['data_type'], f'audio/{downloaded_item["data_type"]}'
)
data_uri = f'data:{mime_type};base64,{downloaded_item["data"]}'
audio = InputAudio(data=data_uri, format=downloaded_item['data_type'])
else:
audio = InputAudio(data=downloaded_item['data'], format=downloaded_item['data_type'])
content.append(ChatCompletionContentPartInputAudioParam(input_audio=audio, type='input_audio'))
elif isinstance(item, DocumentUrl):
if self._is_text_like_media_type(item.media_type):
Expand Down
7 changes: 7 additions & 0 deletions pydantic_ai_slim/pydantic_ai/profiles/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ class OpenAIModelProfile(ModelProfile):
openai_chat_supports_web_search: bool = False
"""Whether the model supports web search in Chat Completions API."""

openai_audio_input_encoding: Literal['base64', 'uri'] = 'base64'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is specific to OpenAIChatModel and doesn't affect OpenAIResponsesModel so let's prefix with openai_chat_

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense — this is only used by OpenAIChatModel. I’ll rename the profile field to openai_chat_audio_input_encoding and update the chat mapping to use that, so it’s clearly scoped to Chat Completions and doesn’t imply anything about OpenAIResponsesModel.

"""The encoding to use for audio input.

- `'base64'`: Raw base64 encoded string. (Default, used by OpenAI)
- `'uri'`: Data URI (e.g. `data:audio/wav;base64,...`). (Used by Qwen Omni)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should still make it so that this is used automatically for Qwen Omni. If that's only a requirement of Qwen's own ChatCompletions-compatible API, we may want a new provider class that can define its own model_profile method and be used with OpenAIChatModel. We shouldn't set this in the existing qwen_model_profile method as Qwen can also be used with providers that probably do not have this quirk.

"""

openai_supports_encrypted_reasoning_content: bool = False
"""Whether the model supports including encrypted reasoning content in the response."""

Expand Down
135 changes: 135 additions & 0 deletions tests/models/test_openai_audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
from __future__ import annotations as _annotations

import base64
from unittest.mock import patch

import pytest

from pydantic_ai import Agent, AudioUrl, BinaryContent
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.profiles.openai import OpenAIModelProfile
from pydantic_ai.providers.openai import OpenAIProvider

from ..conftest import try_import
from .mock_openai import MockOpenAI, completion_message, get_mock_chat_completion_kwargs

with try_import() as imports_successful:
from openai.types.chat.chat_completion_message import ChatCompletionMessage

pytestmark = [
pytest.mark.skipif(not imports_successful(), reason='openai not installed'),
pytest.mark.anyio,
]


def test_openai_chat_audio_default_base64(allow_model_requests: None):
    """Binary audio is sent as a raw base64 string when no encoding override is set."""
    response = completion_message(ChatCompletionMessage(content='success', role='assistant'))
    client = MockOpenAI.create_mock(response)
    agent = Agent(OpenAIChatModel('gpt-4o-audio-preview', provider=OpenAIProvider(openai_client=client)))

    raw_bytes = b'fake_audio_data'
    agent.run_sync(['Process this audio', BinaryContent(raw_bytes, media_type='audio/wav')])

    requests = get_mock_chat_completion_kwargs(client)
    content_parts = requests[0]['messages'][0]['content']

    # Locate the audio part among the mapped content parts.
    audio_parts = [part for part in content_parts if part['type'] == 'input_audio']
    input_audio = audio_parts[0]['input_audio']

    # Default profile: plain base64, no data-URI wrapper.
    assert input_audio['data'] == base64.b64encode(raw_bytes).decode('utf-8')
    assert input_audio['format'] == 'wav'


def test_openai_chat_audio_uri_encoding(allow_model_requests: None):
    """With `openai_audio_input_encoding='uri'`, binary audio becomes a data URI."""
    response = completion_message(ChatCompletionMessage(content='success', role='assistant'))
    client = MockOpenAI.create_mock(response)

    # Opt into data-URI encoding via the model profile.
    model = OpenAIChatModel(
        'gpt-4o-audio-preview',
        provider=OpenAIProvider(openai_client=client),
        profile=OpenAIModelProfile(openai_audio_input_encoding='uri'),
    )
    agent = Agent(model)

    raw_bytes = b'fake_audio_data'
    agent.run_sync(['Process this audio', BinaryContent(raw_bytes, media_type='audio/wav')])

    requests = get_mock_chat_completion_kwargs(client)
    content_parts = requests[0]['messages'][0]['content']
    audio_parts = [part for part in content_parts if part['type'] == 'input_audio']
    input_audio = audio_parts[0]['input_audio']

    # The payload must carry the full `data:<mime>;base64,` prefix.
    encoded = base64.b64encode(raw_bytes).decode('utf-8')
    assert input_audio['data'] == f'data:audio/wav;base64,{encoded}'
    assert input_audio['format'] == 'wav'


async def test_openai_chat_audio_url_default_base64(allow_model_requests: None):
    """Downloaded AudioUrl content is passed through as raw base64 by default."""
    response = completion_message(ChatCompletionMessage(content='success', role='assistant'))
    client = MockOpenAI.create_mock(response)
    agent = Agent(OpenAIChatModel('gpt-4o-audio-preview', provider=OpenAIProvider(openai_client=client)))

    # download_item is stubbed so no network access happens; it returns
    # base64 text plus the detected audio format, mirroring the real helper.
    encoded = base64.b64encode(b'fake_downloaded_audio').decode('utf-8')
    with patch('pydantic_ai.models.openai.download_item') as mock_download:
        mock_download.return_value = {'data': encoded, 'data_type': 'mp3'}
        await agent.run(['Process this audio url', AudioUrl('https://example.com/audio.mp3')])

    requests = get_mock_chat_completion_kwargs(client)
    content_parts = requests[0]['messages'][0]['content']
    audio_parts = [part for part in content_parts if part['type'] == 'input_audio']
    input_audio = audio_parts[0]['input_audio']

    # Default profile: the downloaded base64 string is forwarded untouched.
    assert input_audio['data'] == encoded
    assert input_audio['format'] == 'mp3'


async def test_openai_chat_audio_url_uri_encoding(allow_model_requests: None):
    """Downloaded AudioUrl content is wrapped in a data URI when the profile asks for it."""
    response = completion_message(ChatCompletionMessage(content='success', role='assistant'))
    client = MockOpenAI.create_mock(response)

    # Opt into data-URI encoding via the model profile.
    model = OpenAIChatModel(
        'gpt-4o-audio-preview',
        provider=OpenAIProvider(openai_client=client),
        profile=OpenAIModelProfile(openai_audio_input_encoding='uri'),
    )
    agent = Agent(model)

    # download_item is stubbed so no network access happens.
    encoded = base64.b64encode(b'fake_downloaded_audio').decode('utf-8')
    with patch('pydantic_ai.models.openai.download_item') as mock_download:
        mock_download.return_value = {'data': encoded, 'data_type': 'mp3'}
        await agent.run(['Process this audio url', AudioUrl('https://example.com/audio.mp3')])

    requests = get_mock_chat_completion_kwargs(client)
    content_parts = requests[0]['messages'][0]['content']
    audio_parts = [part for part in content_parts if part['type'] == 'input_audio']
    input_audio = audio_parts[0]['input_audio']

    # mp3 maps to the audio/mpeg MIME type in the data URI.
    assert input_audio['data'] == f'data:audio/mpeg;base64,{encoded}'
    assert input_audio['format'] == 'mp3'