Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions docs/models/openai.md
Original file line number Diff line number Diff line change
Expand Up @@ -730,3 +730,33 @@ result = agent.run_sync('What is the capital of France?')
print(result.output)
#> The capital of France is Paris.
```

### Qwen

To use Qwen models via the OpenAI-compatible API from [Alibaba Cloud DashScope](https://www.alibabacloud.com/help/doc-detail/2712576.html), you can set the `QWEN_API_KEY` (or `DASHSCOPE_API_KEY`) environment variable and use [`QwenProvider`][pydantic_ai.providers.qwen.QwenProvider] by name:

```python
from pydantic_ai import Agent

agent = Agent('qwen:qwen-max')
...
```

Or initialise the model and provider directly:

```python
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.qwen import QwenProvider

model = OpenAIChatModel(
'qwen-max',
provider=QwenProvider(api_key='your-qwen-api-key'),
)
agent = Agent(model)
...
```

The `QwenProvider` uses the international DashScope compatible endpoint `https://dashscope-intl.aliyuncs.com/compatible-mode/v1` by default.

When using **Qwen Omni** models (e.g. `qwen-omni-turbo`), this provider automatically handles audio input using the Data URI format required by the DashScope API.
1 change: 1 addition & 0 deletions pydantic_ai_slim/pydantic_ai/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,7 @@ def infer_model( # noqa: C901
'litellm',
'nebius',
'ovhcloud',
'qwen',
):
model_kind = 'openai-chat'
elif model_kind in ('google-gla', 'google-vertex'):
Expand Down
14 changes: 12 additions & 2 deletions pydantic_ai_slim/pydantic_ai/models/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -939,7 +939,11 @@ async def _map_user_prompt(self, part: UserPromptPart) -> chat.ChatCompletionUse
content.append(ChatCompletionContentPartImageParam(image_url=image_url, type='image_url'))
elif item.is_audio:
assert item.format in ('wav', 'mp3')
audio = InputAudio(data=base64.b64encode(item.data).decode('utf-8'), format=item.format)
profile = OpenAIModelProfile.from_profile(self.profile)
if profile.openai_chat_audio_input_encoding == 'uri':
audio = InputAudio(data=item.data_uri, format=item.format)
else:
audio = InputAudio(data=base64.b64encode(item.data).decode('utf-8'), format=item.format)
content.append(ChatCompletionContentPartInputAudioParam(input_audio=audio, type='input_audio'))
elif item.is_document:
content.append(
Expand All @@ -959,7 +963,13 @@ async def _map_user_prompt(self, part: UserPromptPart) -> chat.ChatCompletionUse
'wav',
'mp3',
), f'Unsupported audio format: {downloaded_item["data_type"]}'
audio = InputAudio(data=downloaded_item['data'], format=downloaded_item['data_type'])
profile = OpenAIModelProfile.from_profile(self.profile)
if profile.openai_chat_audio_input_encoding == 'uri':
mime_type = item.media_type or f'audio/{downloaded_item["data_type"]}'
data_uri = f'data:{mime_type};base64,{downloaded_item["data"]}'
audio = InputAudio(data=data_uri, format=downloaded_item['data_type'])
else:
audio = InputAudio(data=downloaded_item['data'], format=downloaded_item['data_type'])
content.append(ChatCompletionContentPartInputAudioParam(input_audio=audio, type='input_audio'))
elif isinstance(item, DocumentUrl):
if self._is_text_like_media_type(item.media_type):
Expand Down
7 changes: 7 additions & 0 deletions pydantic_ai_slim/pydantic_ai/profiles/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ class OpenAIModelProfile(ModelProfile):
openai_chat_supports_web_search: bool = False
"""Whether the model supports web search in Chat Completions API."""

openai_chat_audio_input_encoding: Literal['base64', 'uri'] = 'base64'
"""The encoding to use for audio input in Chat Completions requests.

- `'base64'`: Raw base64 encoded string. (Default, used by OpenAI)
- `'uri'`: Data URI (e.g. `data:audio/wav;base64,...`). (Used by Qwen Omni)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should still make it so that this is used automatically for Qwen Omni. If that's only a requirement of Qwen's own ChatCompletions-compatible API, we may want a new provider class that can define its own model_profile method and be used with OpenAIChatModel. We shouldn't set this in the existing qwen_model_profile method as Qwen can also be used with providers that probably do not have this quirk.

"""

openai_supports_encrypted_reasoning_content: bool = False
"""Whether the model supports including encrypted reasoning content in the response."""

Expand Down
4 changes: 4 additions & 0 deletions pydantic_ai_slim/pydantic_ai/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,10 @@ def infer_provider_class(provider: str) -> type[Provider[Any]]: # noqa: C901
from .ovhcloud import OVHcloudProvider

return OVHcloudProvider
elif provider == 'qwen':
from .qwen import QwenProvider

return QwenProvider
elif provider == 'outlines':
from .outlines import OutlinesProvider

Expand Down
86 changes: 86 additions & 0 deletions pydantic_ai_slim/pydantic_ai/providers/qwen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from __future__ import annotations as _annotations

import os
from typing import overload

import httpx
from openai import AsyncOpenAI

from pydantic_ai import ModelProfile
from pydantic_ai.exceptions import UserError
from pydantic_ai.models import cached_async_http_client
from pydantic_ai.profiles.openai import OpenAIJsonSchemaTransformer, OpenAIModelProfile
from pydantic_ai.profiles.qwen import qwen_model_profile
from pydantic_ai.providers import Provider

try:
from openai import AsyncOpenAI
except ImportError as _import_error: # pragma: no cover
raise ImportError(
'Please install the `openai` package to use the Qwen provider, '
'you can use the `openai` optional group — `pip install "pydantic-ai-slim[openai]"`'
) from _import_error


class QwenProvider(Provider[AsyncOpenAI]):
"""Provider for Qwen / DashScope OpenAI-compatible API."""

@property
def name(self) -> str:
return 'qwen'

@property
def base_url(self) -> str:
# Using the international endpoint by default as it's more standard for global users
# Users in China region can override this via passing `openai_client` or implementing logic to check region
return 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1'

@property
def client(self) -> AsyncOpenAI:
return self._client

def model_profile(self, model_name: str) -> ModelProfile | None:
base_profile = qwen_model_profile(model_name)

# Wrap/merge into OpenAIModelProfile
openai_profile = OpenAIModelProfile(json_schema_transformer=OpenAIJsonSchemaTransformer).update(base_profile)

# For Qwen Omni models, force URI audio input encoding
if 'omni' in model_name.lower():
openai_profile = OpenAIModelProfile(openai_chat_audio_input_encoding='uri').update(openai_profile)

return openai_profile

@overload
def __init__(self) -> None: ...

@overload
def __init__(self, *, api_key: str) -> None: ...

@overload
def __init__(self, *, api_key: str, http_client: httpx.AsyncClient) -> None: ...

@overload
def __init__(self, *, openai_client: AsyncOpenAI | None = None) -> None: ...

def __init__(
self,
*,
api_key: str | None = None,
openai_client: AsyncOpenAI | None = None,
http_client: httpx.AsyncClient | None = None,
) -> None:
api_key = api_key or os.getenv('QWEN_API_KEY') or os.getenv('DASHSCOPE_API_KEY')
if not api_key and openai_client is None:
raise UserError(
'Set the `QWEN_API_KEY` (or `DASHSCOPE_API_KEY`) environment variable or pass it via '
'`QwenProvider(api_key=...)` to use the Qwen provider.'
)

if openai_client is not None:
self._client = openai_client
elif http_client is not None:
self._client = AsyncOpenAI(base_url=self.base_url, api_key=api_key, http_client=http_client)
else:
http_client = cached_async_http_client(provider='qwen')
self._client = AsyncOpenAI(base_url=self.base_url, api_key=api_key, http_client=http_client)
167 changes: 167 additions & 0 deletions tests/models/test_openai_audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
from __future__ import annotations as _annotations

import base64
from unittest.mock import patch

import pytest

from pydantic_ai import Agent, AudioUrl, BinaryContent
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.profiles.openai import OpenAIModelProfile
from pydantic_ai.providers.openai import OpenAIProvider

from ..conftest import try_import
from .mock_openai import MockOpenAI, completion_message, get_mock_chat_completion_kwargs

with try_import() as imports_successful:
from openai.types.chat.chat_completion_message import ChatCompletionMessage

pytestmark = [
pytest.mark.skipif(not imports_successful(), reason='openai not installed'),
pytest.mark.anyio,
]


def test_openai_chat_audio_default_base64(allow_model_requests: None):
    """With the default profile, binary audio is sent as a plain base64 string."""
    response = completion_message(ChatCompletionMessage(content='success', role='assistant'))
    client = MockOpenAI.create_mock(response)
    agent = Agent(OpenAIChatModel('gpt-4o-audio-preview', provider=OpenAIProvider(openai_client=client)))

    raw_audio = b'fake_audio_data'
    agent.run_sync(['Process this audio', BinaryContent(raw_audio, media_type='audio/wav')])

    content = get_mock_chat_completion_kwargs(client)[0]['messages'][0]['content']
    audio_part = next(part for part in content if part['type'] == 'input_audio')

    # OpenAI proper expects raw base64, not a data URI.
    assert audio_part['input_audio']['data'] == base64.b64encode(raw_audio).decode('utf-8')
    assert audio_part['input_audio']['format'] == 'wav'


def test_openai_chat_audio_uri_encoding(allow_model_requests: None):
    """With `openai_chat_audio_input_encoding='uri'`, binary audio is sent as a data URI."""
    response = completion_message(ChatCompletionMessage(content='success', role='assistant'))
    client = MockOpenAI.create_mock(response)
    model = OpenAIChatModel(
        'gpt-4o-audio-preview',
        provider=OpenAIProvider(openai_client=client),
        profile=OpenAIModelProfile(openai_chat_audio_input_encoding='uri'),
    )
    agent = Agent(model)

    raw_audio = b'fake_audio_data'
    agent.run_sync(['Process this audio', BinaryContent(raw_audio, media_type='audio/wav')])

    content = get_mock_chat_completion_kwargs(client)[0]['messages'][0]['content']
    audio_part = next(part for part in content if part['type'] == 'input_audio')

    # The payload must carry the full data-URI prefix, as required by e.g. Qwen Omni.
    encoded = base64.b64encode(raw_audio).decode('utf-8')
    assert audio_part['input_audio']['data'] == f'data:audio/wav;base64,{encoded}'
    assert audio_part['input_audio']['format'] == 'wav'


async def test_openai_chat_audio_url_default_base64(allow_model_requests: None):
    """Audio downloaded from an `AudioUrl` is forwarded as raw base64 by default."""
    response = completion_message(ChatCompletionMessage(content='success', role='assistant'))
    client = MockOpenAI.create_mock(response)
    agent = Agent(OpenAIChatModel('gpt-4o-audio-preview', provider=OpenAIProvider(openai_client=client)))

    downloaded_b64 = base64.b64encode(b'fake_downloaded_audio').decode('utf-8')

    with patch('pydantic_ai.models.openai.download_item') as download_mock:
        # `download_item` already yields base64-encoded data.
        download_mock.return_value = {'data': downloaded_b64, 'data_type': 'mp3'}

        await agent.run(['Process this audio url', AudioUrl('https://example.com/audio.mp3')])

    content = get_mock_chat_completion_kwargs(client)[0]['messages'][0]['content']
    audio_part = next(part for part in content if part['type'] == 'input_audio')

    # The downloaded base64 payload is passed through untouched.
    assert audio_part['input_audio']['data'] == downloaded_b64
    assert audio_part['input_audio']['format'] == 'mp3'


async def test_openai_chat_audio_url_uri_encoding(allow_model_requests: None):
    """With the `'uri'` encoding, downloaded audio is wrapped in a data URI with its MIME type."""
    response = completion_message(ChatCompletionMessage(content='success', role='assistant'))
    client = MockOpenAI.create_mock(response)
    model = OpenAIChatModel(
        'gpt-4o-audio-preview',
        provider=OpenAIProvider(openai_client=client),
        profile=OpenAIModelProfile(openai_chat_audio_input_encoding='uri'),
    )
    agent = Agent(model)

    downloaded_b64 = base64.b64encode(b'fake_downloaded_audio').decode('utf-8')

    with patch('pydantic_ai.models.openai.download_item') as download_mock:
        # `download_item` already yields base64-encoded data.
        download_mock.return_value = {'data': downloaded_b64, 'data_type': 'mp3'}

        await agent.run(['Process this audio url', AudioUrl('https://example.com/audio.mp3')])

    content = get_mock_chat_completion_kwargs(client)[0]['messages'][0]['content']
    audio_part = next(part for part in content if part['type'] == 'input_audio')

    # `.mp3` URLs resolve to the `audio/mpeg` MIME type in the data URI.
    assert audio_part['input_audio']['data'] == f'data:audio/mpeg;base64,{downloaded_b64}'
    assert audio_part['input_audio']['format'] == 'mp3'


async def test_openai_chat_audio_url_custom_media_type(allow_model_requests: None):
    """An explicit `media_type` on the `AudioUrl` wins over the extension-derived one in the data URI."""
    response = completion_message(ChatCompletionMessage(content='success', role='assistant'))
    client = MockOpenAI.create_mock(response)
    model = OpenAIChatModel(
        'gpt-4o-audio-preview',
        provider=OpenAIProvider(openai_client=client),
        profile=OpenAIModelProfile(openai_chat_audio_input_encoding='uri'),
    )
    agent = Agent(model)

    # `.mp3` extension, but an explicit (deliberately unusual) media type is forced.
    url = AudioUrl('https://example.com/audio.mp3', media_type='audio/custom-weird-format')

    downloaded_b64 = base64.b64encode(b'fake_downloaded_audio').decode('utf-8')

    with patch('pydantic_ai.models.openai.download_item') as download_mock:
        download_mock.return_value = {'data': downloaded_b64, 'data_type': 'mp3'}

        await agent.run(['Process this audio url', url])

    content = get_mock_chat_completion_kwargs(client)[0]['messages'][0]['content']
    audio_part = next(part for part in content if part['type'] == 'input_audio')

    # The data URI must carry the CUSTOM MIME type, not `audio/mpeg`.
    assert audio_part['input_audio']['data'] == f'data:audio/custom-weird-format;base64,{downloaded_b64}'
    assert audio_part['input_audio']['format'] == 'mp3'
Loading
Loading