Fix realtime audio (#129)

truemagic-coder · web-flow · commit c88e079dcab2 · 2025-09-09T16:26:32.000-07:00
* wip

* wip

* wip

* done
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "solana-agent"
-version = "31.2.1"
+version = "31.2.2"
 description = "AI Agents for Solana"
 authors = ["Bevan Hunt <bevan@bevanhunt.com>"]
 license = "MIT"
diff --git a/solana_agent/adapters/openai_realtime_ws.py b/solana_agent/adapters/openai_realtime_ws.py
@@ -961,9 +961,6 @@ async def update_session(
         if audio_patch:
             patch["audio"] = audio_patch
 
-        # Always include session.type in updates
-        patch["type"] = "realtime"
-
         # No top-level turn_detection
 
         def _strip_tool_strict(tools_val):
@@ -1030,7 +1027,8 @@ def _strip_tool_strict(tools_val):
                 )
         except Exception:
             pass
-        await self._send(payload)
+        # Use tracked send to attach an event_id and improve diagnostics
+        await self._send_tracked(payload, label="session.update:patch")
 
     async def append_audio(self, pcm16_bytes: bytes) -> None:  # pragma: no cover
         b64 = base64.b64encode(pcm16_bytes).decode("ascii")
@@ -1045,10 +1043,16 @@ async def append_audio(self, pcm16_bytes: bytes) -> None:  # pragma: no cover
 
     async def commit_input(self) -> None:  # pragma: no cover
         try:
-            # Skip commits while a response is active to avoid server errors
+            # If a previous response is still marked active, wait briefly, then proceed.
+            # Skipping commits here can cause new turns to reference old audio and repeat answers.
             if bool(getattr(self, "_response_active", False)):
-                logger.warning("Realtime WS: skipping commit; response active")
-                return
+                logger.warning(
+                    "Realtime WS: response active at commit; waiting briefly before proceeding"
+                )
+                for _ in range(5):  # up to ~0.5s
+                    await asyncio.sleep(0.1)
+                    if not bool(getattr(self, "_response_active", False)):
+                        break
             # Avoid overlapping commits while awaiting server ack
             if bool(getattr(self, "_commit_inflight", False)):
                 logger.warning("Realtime WS: skipping commit; commit in-flight")
@@ -1250,6 +1254,24 @@ def iter_output_transcript(self) -> AsyncGenerator[str, None]:  # pragma: no cov
     def set_tool_executor(self, executor):  # pragma: no cover
         self._tool_executor = executor
 
+    def reset_output_stream(self) -> None:  # pragma: no cover
+        """Drain any queued output audio and clear per-response text buffers.
+        This avoids replaying stale audio if the client failed to consume previous chunks."""
+        try:
+            while True:
+                try:
+                    _ = self._audio_queue.get_nowait()
+                except asyncio.QueueEmpty:
+                    break
+                except Exception:
+                    break
+            try:
+                self._out_text_buffers.clear()
+            except Exception:
+                pass
+        except Exception:
+            pass
+
     # Expose whether a function/tool call is currently pending
     def has_pending_tool_call(self) -> bool:  # pragma: no cover
         try:
@@ -1611,3 +1633,7 @@ async def _empty():
     def set_tool_executor(self, executor):  # pragma: no cover
         # Not applicable for transcription-only
         return
+
+    def reset_output_stream(self) -> None:  # pragma: no cover
+        # Transcription-only session: there is no audio output stream to reset
+        return
diff --git a/solana_agent/services/query.py b/solana_agent/services/query.py
@@ -669,6 +669,12 @@ async def _exec(tool_name: str, args: Dict[str, Any]) -> Dict[str, Any]:
                     await rt.clear_input()
                 except Exception:
                     pass
+                # Also reset any leftover output audio so the new turn doesn't replay old chunks
+                try:
+                    if hasattr(rt, "reset_output_stream"):
+                        rt.reset_output_stream()
+                except Exception:
+                    pass
 
                 # Persist once per turn
                 turn_id = await self.realtime_begin_turn(user_id)
diff --git a/solana_agent/services/realtime.py b/solana_agent/services/realtime.py
@@ -185,6 +185,13 @@ def iter_events(self) -> AsyncGenerator[Dict[str, Any], None]:  # pragma: no cov
     def iter_output_audio(self) -> AsyncGenerator[bytes, None]:  # pragma: no cover
         return self._session.iter_output_audio()
 
+    def reset_output_stream(self) -> None:  # pragma: no cover
+        try:
+            if hasattr(self._session, "reset_output_stream"):
+                self._session.reset_output_stream()
+        except Exception:
+            pass
+
     async def iter_output_audio_encoded(
         self,
     ) -> AsyncGenerator[bytes, None]:  # pragma: no cover
@@ -447,6 +454,13 @@ def iter_events(self) -> AsyncGenerator[Dict[str, Any], None]:  # pragma: no cov
     def iter_output_audio(self) -> AsyncGenerator[bytes, None]:  # pragma: no cover
         return self._conv.iter_output_audio()
 
+    def reset_output_stream(self) -> None:  # pragma: no cover
+        try:
+            if hasattr(self._conv, "reset_output_stream"):
+                self._conv.reset_output_stream()
+        except Exception:
+            pass
+
     async def iter_output_audio_encoded(
         self,
     ) -> AsyncGenerator[bytes, None]:  # pragma: no cover