From 12687bc115b329838c83a29fcd83d5118cad32ad Mon Sep 17 00:00:00 2001 From: Bevan Hunt Date: Tue, 9 Sep 2025 22:58:05 -0700 Subject: [PATCH 1/3] update --- pyproject.toml | 2 +- solana_agent/adapters/ffmpeg_transcoder.py | 44 ++++++++++++++++++++-- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 48811934..174cab88 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "solana-agent" -version = "31.2.3" +version = "31.2.4-dev1" description = "AI Agents for Solana" authors = ["Bevan Hunt "] license = "MIT" diff --git a/solana_agent/adapters/ffmpeg_transcoder.py b/solana_agent/adapters/ffmpeg_transcoder.py index 0ce694a9..3916a538 100644 --- a/solana_agent/adapters/ffmpeg_transcoder.py +++ b/solana_agent/adapters/ffmpeg_transcoder.py @@ -4,6 +4,8 @@ import contextlib import logging from typing import List, AsyncGenerator +import tempfile +import os from solana_agent.interfaces.providers.audio import AudioTranscoder @@ -49,11 +51,45 @@ async def to_pcm16( # pragma: no cover rate_hz, len(audio_bytes), ) - # Prefer to hint format for common containers/codecs; ffmpeg can still autodetect if hint is wrong. - hinted_format = None + # iOS-recorded MP4/M4A often requires a seekable input for reliable demuxing. + # Decode from a temporary file instead of stdin for MP4/M4A. if input_mime in ("audio/mp4", "audio/m4a"): - hinted_format = "mp4" - elif input_mime in ("audio/aac",): + suffix = ".m4a" if input_mime == "audio/m4a" else ".mp4" + tmp_path = None + try: + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tf: + tmp_path = tf.name + tf.write(audio_bytes) + args = [ + "-hide_banner", + "-loglevel", + "error", + "-i", + tmp_path, + "-vn", # ignore any video tracks + "-acodec", + "pcm_s16le", + "-ac", + "1", + "-ar", + str(rate_hz), + "-f", + "s16le", + "pipe:1", + ] + out = await self._run_ffmpeg(args, b"") + logger.info( + "Transcoded (MP4/M4A temp-file) to PCM16: output_len=%d", len(out) + ) + return out + finally: + if tmp_path: + with contextlib.suppress(Exception): + os.remove(tmp_path) + + # For other formats, prefer a format hint when helpful and decode from stdin. + hinted_format = None + if input_mime in ("audio/aac",): # Raw AAC is typically in ADTS stream format hinted_format = "adts" elif input_mime in ("audio/ogg", "audio/webm"): From 8219d044a959c02112578ba1710d89f4ebac0938 Mon Sep 17 00:00:00 2001 From: Bevan Hunt Date: Tue, 9 Sep 2025 23:13:15 -0700 Subject: [PATCH 2/3] wip --- README.md | 6 +++--- docs/index.rst | 8 ++++---- pyproject.toml | 2 +- solana_agent/adapters/openai_realtime_ws.py | 9 +++++++++ 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 2c74a90f..63013762 100644 --- a/README.md +++ b/README.md @@ -299,18 +299,18 @@ async def generate(): rt_encode_output=True, rt_voice="marin", output_format="audio", - audio_output_format="m4a", + audio_output_format="mp3", audio_input_format="mp4", ): yield chunk return StreamingResponse( content=generate(), - media_type="audio/mp4", + media_type="audio/mp3", headers={ "Cache-Control": "no-store", "Pragma": "no-cache", - "Content-Disposition": "inline; filename=stream.m4a", + "Content-Disposition": "inline; filename=stream.mp3", "X-Accel-Buffering": "no", }, ) diff --git a/docs/index.rst b/docs/index.rst index 7f3aa7f8..76696a44 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -224,18 +224,18 @@ This example will work using expo-audio on Android and iOS. rt_encode_output=True, rt_voice="marin", output_format="audio", - audio_output_format="m4a", - audio_input_format="mp4", + audio_output_format="mp3", + audio_input_format="m4a", ): yield chunk return StreamingResponse( content=generate(), - media_type="audio/mp4", + media_type="audio/mp3", headers={ "Cache-Control": "no-store", "Pragma": "no-cache", - "Content-Disposition": "inline; filename=stream.m4a", + "Content-Disposition": "inline; filename=stream.mp3", "X-Accel-Buffering": "no", }, ) diff --git a/pyproject.toml b/pyproject.toml index 174cab88..14d4328b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "solana-agent" -version = "31.2.4-dev1" +version = "31.2.4-dev2" description = "AI Agents for Solana" authors = ["Bevan Hunt "] license = "MIT" diff --git a/solana_agent/adapters/openai_realtime_ws.py b/solana_agent/adapters/openai_realtime_ws.py index ec377569..52233e51 100644 --- a/solana_agent/adapters/openai_realtime_ws.py +++ b/solana_agent/adapters/openai_realtime_ws.py @@ -1037,6 +1037,15 @@ def _strip_tool_strict(tools_val): if "tools" in patch: patch["tools"] = _strip_tool_strict(patch["tools"]) # idempotent + # Per server requirements, always include session.type and output_modalities + try: + patch["type"] = "realtime" + # Preserve caller-provided output_modalities if present, otherwise default to audio + if "output_modalities" not in patch: + patch["output_modalities"] = ["audio"] + except Exception: + pass + payload = {"type": "session.update", "session": patch} # Mark awaiting updated and store last patch self._last_session_patch = patch or {} From ff49e783f358cce297e325809cf7f879b8fa9942 Mon Sep 17 00:00:00 2001 From: Bevan Hunt Date: Tue, 9 Sep 2025 23:18:39 -0700 Subject: [PATCH 3/3] done and working on ios and android --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 14d4328b..c20a64cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "solana-agent" -version = "31.2.4-dev2" +version = "31.2.4" description = "AI Agents for Solana" authors = ["Bevan Hunt "] license = "MIT"