Fix real-time streaming (#130)

truemagic-coder · web-flow · commit 9a7243f469c1 · 2025-09-09T22:29:43.000-07:00
diff --git a/README.md b/README.md
@@ -281,27 +281,39 @@ Due to the overhead of the router (API call) - realtime only supports a single a
 
 Realtime uses MongoDB for memory so Zep is not needed.
 
+This example will work using expo-audio on Android and iOS.
+
 ```python
 from solana_agent import SolanaAgent
 
 solana_agent = SolanaAgent(config=config)
 
-# Example: mobile sends MP4/AAC; server encodes output to AAC
-audio_content = await audio_file.read()  # bytes
-async for audio_chunk in solana_agent.process(
-    "user123",                    # required
-    audio_content,                # required
-    realtime=True,                # optional (default False)
-    output_format="audio",        # required
-    vad=True,                     # enable VAD (optional)
-    rt_encode_input=True,         # accept compressed input (optional)
-    rt_encode_output=True,        # encode output for client (optional)
-    rt_voice="marin"              # the voice to use for interactions (optional)
-    audio_input_format="mp4",     # client transport (optional)
-    audio_output_format="aac"     # client transport (optional)
-):
-    handle_audio(audio_chunk)
-```
+audio_content = await audio_file.read()
+
+async def generate():
+    async for chunk in solana_agent.process(
+        user_id=user_id, 
+        message=audio_content,
+        realtime=True,
+        rt_encode_input=True,
+        rt_encode_output=True,
+        rt_voice="marin",
+        output_format="audio",
+        audio_output_format="m4a",
+        audio_input_format="mp4",
+    ):
+        yield chunk
+
+return StreamingResponse(
+    content=generate(),
+    media_type="audio/mp4",
+    headers={
+        "Cache-Control": "no-store",
+        "Pragma": "no-cache",
+        "Content-Disposition": "inline; filename=stream.m4a",
+        "X-Accel-Buffering": "no",
+    },
+)
 
 ### Image/Text Streaming
 
diff --git a/docs/index.rst b/docs/index.rst
@@ -205,27 +205,40 @@ Due to the overhead of the router (API call) - realtime only supports a single a
 
 Realtime uses MongoDB for memory so Zep is not needed.
 
+This example will work using expo-audio on Android and iOS.
+
 .. code-block:: python
 
    from solana_agent import SolanaAgent
 
    solana_agent = SolanaAgent(config=config)
 
-   # Example: mobile sends MP4/AAC; server encodes output to AAC
-   audio_content = await audio_file.read()  # bytes
-   async for audio_chunk in solana_agent.process(
-      "user123",                    # required
-      audio_content,                # required
-      realtime=True,                # optional (default False)
-      output_format="audio",        # required
-      vad=True,                     # enable VAD (optional)
-      rt_encode_input=True,         # accept compressed input (optional)
-      rt_encode_output=True,        # encode output for client (optional)
-      rt_voice="marin"              # the voice to use for interactions (optional)
-      audio_input_format="mp4",     # client transport (optional)
-      audio_output_format="aac"     # client transport (optional)
-   ):
-      handle_audio(audio_chunk)
+   audio_content = await audio_file.read()
+
+   async def generate():
+      async for chunk in solana_agent.process(
+         user_id=user_id, 
+         message=audio_content,
+         realtime=True,
+         rt_encode_input=True,
+         rt_encode_output=True,
+         rt_voice="marin",
+         output_format="audio",
+         audio_output_format="m4a",
+         audio_input_format="mp4",
+      ):
+         yield chunk
+
+   return StreamingResponse(
+      content=generate(),
+      media_type="audio/mp4",
+      headers={
+         "Cache-Control": "no-store",
+         "Pragma": "no-cache",
+         "Content-Disposition": "inline; filename=stream.m4a",
+         "X-Accel-Buffering": "no",
+      },
+   )
 
 Image/Text Streaming
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "solana-agent"
-version = "31.2.2"
+version = "31.2.3"
 description = "AI Agents for Solana"
 authors = ["Bevan Hunt <bevan@bevanhunt.com>"]
 license = "MIT"
diff --git a/solana_agent/adapters/ffmpeg_transcoder.py b/solana_agent/adapters/ffmpeg_transcoder.py
@@ -88,13 +88,14 @@ async def to_pcm16(  # pragma: no cover
     async def from_pcm16(  # pragma: no cover
         self, pcm16_bytes: bytes, output_mime: str, rate_hz: int
     ) -> bytes:
-        """Encode PCM16LE to desired format (currently AAC ADTS for mobile streaming)."""
+        """Encode PCM16LE to desired format (AAC ADTS, fragmented MP4, or MP3)."""
         logger.info(
             "Encode from PCM16: output_mime=%s, rate_hz=%d, input_len=%d",
             output_mime,
             rate_hz,
             len(pcm16_bytes),
         )
+
         if output_mime in ("audio/mpeg", "audio/mp3"):
             # Encode to MP3 (often better streaming compatibility on mobile)
             args = [
@@ -122,8 +123,9 @@ async def from_pcm16(  # pragma: no cover
                 "Encoded from PCM16 to %s: output_len=%d", output_mime, len(out)
             )
             return out
-        if output_mime in ("audio/aac", "audio/mp4", "audio/m4a"):
-            # Encode to AAC in ADTS stream; clients can play it as AAC.
+
+        if output_mime in ("audio/aac",):
+            # Encode to AAC in ADTS stream; good for streaming over sockets/HTTP chunked
             args = [
                 "-hide_banner",
                 "-loglevel",
@@ -149,6 +151,38 @@ async def from_pcm16(  # pragma: no cover
                 "Encoded from PCM16 to %s: output_len=%d", output_mime, len(out)
             )
             return out
+
+        if output_mime in ("audio/mp4", "audio/m4a"):
+            # Encode to fragmented MP4 (fMP4) with AAC for better iOS compatibility
+            # For streaming, write an initial moov and fragment over stdout.
+            args = [
+                "-hide_banner",
+                "-loglevel",
+                "error",
+                "-f",
+                "s16le",
+                "-ac",
+                "1",
+                "-ar",
+                str(rate_hz),
+                "-i",
+                "pipe:0",
+                "-c:a",
+                "aac",
+                "-b:a",
+                "96k",
+                "-movflags",
+                "+frag_keyframe+empty_moov",
+                "-f",
+                "mp4",
+                "pipe:1",
+            ]
+            out = await self._run_ffmpeg(args, pcm16_bytes)
+            logger.info(
+                "Encoded from PCM16 to %s (fMP4): output_len=%d", output_mime, len(out)
+            )
+            return out
+
         # Default: passthrough
         logger.info("Encode passthrough (no change), output_len=%d", len(pcm16_bytes))
         return pcm16_bytes
@@ -187,7 +221,7 @@ async def stream_from_pcm16(  # pragma: no cover
                 "mp3",
                 "pipe:1",
             ]
-        elif output_mime in ("audio/aac", "audio/mp4", "audio/m4a"):
+        elif output_mime in ("audio/aac",):
             args = [
                 "-hide_banner",
                 "-loglevel",
@@ -208,6 +242,29 @@ async def stream_from_pcm16(  # pragma: no cover
                 "adts",
                 "pipe:1",
             ]
+        elif output_mime in ("audio/mp4", "audio/m4a"):
+            args = [
+                "-hide_banner",
+                "-loglevel",
+                "error",
+                "-f",
+                "s16le",
+                "-ac",
+                "1",
+                "-ar",
+                str(rate_hz),
+                "-i",
+                "pipe:0",
+                "-c:a",
+                "aac",
+                "-b:a",
+                "96k",
+                "-movflags",
+                "+frag_keyframe+empty_moov",
+                "-f",
+                "mp4",
+                "pipe:1",
+            ]
         else:
             # Passthrough streaming: just yield input
             async for chunk in pcm_iter:
diff --git a/solana_agent/adapters/openai_realtime_ws.py b/solana_agent/adapters/openai_realtime_ws.py
@@ -325,7 +325,26 @@ async def _recv_loop(self) -> None:  # pragma: no cover
                             try:
                                 chunk = base64.b64decode(b64)
                                 self._audio_queue.put_nowait(chunk)
-                                logger.info("Audio delta bytes=%d", len(chunk))
+                                # Ownership/response tagging for diagnostics
+                                try:
+                                    owner = getattr(self, "_owner_user_id", None)
+                                except Exception:
+                                    owner = None
+                                try:
+                                    rid = getattr(self, "_active_response_id", None)
+                                except Exception:
+                                    rid = None
+                                try:
+                                    gen = int(getattr(self, "_response_generation", 0))
+                                except Exception:
+                                    gen = None
+                                logger.info(
+                                    "Audio delta bytes=%d owner=%s rid=%s gen=%s",
+                                    len(chunk),
+                                    owner,
+                                    rid,
+                                    gen,
+                                )
                                 try:
                                     # New response detected if we were previously inactive
                                     if not getattr(self, "_response_active", False):
@@ -492,8 +511,25 @@ async def _recv_loop(self) -> None:  # pragma: no cover
                         "response.audio.done",
                     ):
                         # End of audio stream for the response; stop audio iterator but keep WS open for transcripts
+                        try:
+                            owner = getattr(self, "_owner_user_id", None)
+                        except Exception:
+                            owner = None
+                        try:
+                            rid = (data.get("response") or {}).get("id") or getattr(
+                                self, "_active_response_id", None
+                            )
+                        except Exception:
+                            rid = None
+                        try:
+                            gen = int(getattr(self, "_response_generation", 0))
+                        except Exception:
+                            gen = None
                         logger.info(
-                            "Realtime WS: output audio done; ending audio stream"
+                            "Realtime WS: output audio done; owner=%s rid=%s gen=%s",
+                            owner,
+                            rid,
+                            gen,
                         )
                         # If we have a buffered transcript for this response, flush it now
                         try:
diff --git a/solana_agent/services/query.py b/solana_agent/services/query.py
diff --git a/tests/unit/services/test_query_realtime_concurrency.py b/tests/unit/services/test_query_realtime_concurrency.py