fix realtime sessions and iOS encoding (#131)

truemagic-coder · web-flow · commit 39a6c0e12b2f · 2025-09-09T23:22:18.000-07:00
* update

* wip

* done and working on iOS and Android
diff --git a/README.md b/README.md
@@ -299,18 +299,18 @@ async def generate():
         rt_encode_output=True,
         rt_voice="marin",
         output_format="audio",
-        audio_output_format="m4a",
+        audio_output_format="mp3",
         audio_input_format="mp4",
     ):
         yield chunk
 
 return StreamingResponse(
     content=generate(),
-    media_type="audio/mp4",
+    media_type="audio/mp3",
     headers={
         "Cache-Control": "no-store",
         "Pragma": "no-cache",
-        "Content-Disposition": "inline; filename=stream.m4a",
+        "Content-Disposition": "inline; filename=stream.mp3",
         "X-Accel-Buffering": "no",
     },
 )
diff --git a/docs/index.rst b/docs/index.rst
@@ -224,18 +224,18 @@ This example will work using expo-audio on Android and iOS.
          rt_encode_output=True,
          rt_voice="marin",
          output_format="audio",
-         audio_output_format="m4a",
-         audio_input_format="mp4",
+         audio_output_format="mp3",
+         audio_input_format="m4a",
       ):
          yield chunk
 
    return StreamingResponse(
       content=generate(),
-      media_type="audio/mp4",
+      media_type="audio/mp3",
       headers={
          "Cache-Control": "no-store",
          "Pragma": "no-cache",
-         "Content-Disposition": "inline; filename=stream.m4a",
+         "Content-Disposition": "inline; filename=stream.mp3",
          "X-Accel-Buffering": "no",
       },
    )
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "solana-agent"
-version = "31.2.3"
+version = "31.2.4"
 description = "AI Agents for Solana"
 authors = ["Bevan Hunt <bevan@bevanhunt.com>"]
 license = "MIT"
diff --git a/solana_agent/adapters/ffmpeg_transcoder.py b/solana_agent/adapters/ffmpeg_transcoder.py
@@ -4,6 +4,8 @@
 import contextlib
 import logging
 from typing import List, AsyncGenerator
+import tempfile
+import os
 
 from solana_agent.interfaces.providers.audio import AudioTranscoder
 
@@ -49,11 +51,45 @@ async def to_pcm16(  # pragma: no cover
             rate_hz,
             len(audio_bytes),
         )
-        # Prefer to hint format for common containers/codecs; ffmpeg can still autodetect if hint is wrong.
-        hinted_format = None
+        # iOS-recorded MP4/M4A often requires a seekable input for reliable demuxing.
+        # Decode from a temporary file instead of stdin for MP4/M4A.
         if input_mime in ("audio/mp4", "audio/m4a"):
-            hinted_format = "mp4"
-        elif input_mime in ("audio/aac",):
+            suffix = ".m4a" if input_mime == "audio/m4a" else ".mp4"
+            tmp_path = None
+            try:
+                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tf:
+                    tmp_path = tf.name
+                    tf.write(audio_bytes)
+                args = [
+                    "-hide_banner",
+                    "-loglevel",
+                    "error",
+                    "-i",
+                    tmp_path,
+                    "-vn",  # ignore any video tracks
+                    "-acodec",
+                    "pcm_s16le",
+                    "-ac",
+                    "1",
+                    "-ar",
+                    str(rate_hz),
+                    "-f",
+                    "s16le",
+                    "pipe:1",
+                ]
+                out = await self._run_ffmpeg(args, b"")
+                logger.info(
+                    "Transcoded (MP4/M4A temp-file) to PCM16: output_len=%d", len(out)
+                )
+                return out
+            finally:
+                if tmp_path:
+                    with contextlib.suppress(Exception):
+                        os.remove(tmp_path)
+
+        # For other formats, prefer a format hint when helpful and decode from stdin.
+        hinted_format = None
+        if input_mime in ("audio/aac",):
             # Raw AAC is typically in ADTS stream format
             hinted_format = "adts"
         elif input_mime in ("audio/ogg", "audio/webm"):
diff --git a/solana_agent/adapters/openai_realtime_ws.py b/solana_agent/adapters/openai_realtime_ws.py
@@ -1037,6 +1037,15 @@ def _strip_tool_strict(tools_val):
         if "tools" in patch:
             patch["tools"] = _strip_tool_strict(patch["tools"])  # idempotent
 
+        # Per server requirements, always include session.type and output_modalities
+        try:
+            patch["type"] = "realtime"
+            # Preserve caller-provided output_modalities if present, otherwise default to audio
+            if "output_modalities" not in patch:
+                patch["output_modalities"] = ["audio"]
+        except Exception:
+            pass
+
         payload = {"type": "session.update", "session": patch}
         # Mark awaiting updated and store last patch
         self._last_session_patch = patch or {}