truemagic-coder · truemagic-coder · Sep 13, 2025 · Sep 13, 2025 · Sep 13, 2025 · Sep 13, 2025
diff --git a/.coveragerc b/.coveragerc
@@ -6,6 +6,7 @@ omit =
     */site-packages/*
     setup.py
     solana_agent/cli.py
+    solana_agent/interfaces/providers/realtime.py  # exclude interface-only module
 
 [report]
 exclude_lines =

diff --git a/README.md b/README.md
@@ -62,6 +62,7 @@ Smart workflows are as easy as combining your tools and prompts.
 * Simple agent definition using JSON
 * Designed for a multi-agent swarm 
 * Fast multi-modal processing of text, audio, and images
+* Dual modality realtime streaming with simultaneous audio and text output
 * Smart workflows that keep flows simple and smart
 * Interact with the Solana blockchain with many useful tools
 * MCP tool usage with first-class support for [Zapier](https://zapier.com/mcp)
@@ -96,7 +97,7 @@ Smart workflows are as easy as combining your tools and prompts.
 **OpenAI**
 * [gpt-4.1](https://platform.openai.com/docs/models/gpt-4.1) (agent & router)
 * [text-embedding-3-large](https://platform.openai.com/docs/models/text-embedding-3-large) (embedding)
-* [gpt-realtime](https://platform.openai.com/docs/models/gpt-realtime) (realtime audio agent)
+* [gpt-realtime](https://platform.openai.com/docs/models/gpt-realtime) (realtime audio agent with dual modality support)
 * [tts-1](https://platform.openai.com/docs/models/tts-1) (audio TTS)
 * [gpt-4o-mini-transcribe](https://platform.openai.com/docs/models/gpt-4o-mini-transcribe) (audio transcription)
 
@@ -245,6 +246,7 @@ async for response in solana_agent.process("user123", "What is the latest news o
 ### Audio/Text Streaming
 
 ```python
+## Realtime Usage
 from solana_agent import SolanaAgent
 
 config = {
@@ -275,28 +277,32 @@ async for response in solana_agent.process("user123", audio_content, audio_input
 
 ### Realtime Audio Streaming
 
-If input and/or output is encoded (compressed) like mp4/aac then you must have `ffmpeg` installed.
+If input and/or output is encoded (compressed) like mp4/mp3 then you must have `ffmpeg` installed.
 
 Due to the overhead of the router (API call) - realtime only supports a single agent setup.
 
 Realtime uses MongoDB for memory so Zep is not needed.
 
+By default, when `realtime=True` and you supply raw/encoded audio bytes as input, the system **always skips the HTTP transcription (STT) path** and relies solely on the realtime websocket session for input transcription. If you don't specify `rt_transcription_model`, a sensible default (`gpt-4o-mini-transcribe`) is auto-selected so you still receive input transcript events with minimal latency.
+
+Implications:
+- `llm_provider.transcribe_audio` is never invoked for realtime turns.
+- Lower end-to-end latency (no duplicate network round trip for STT).
+- Unified transcript sourcing from realtime events.
+- If you explicitly want to disable transcription altogether, send text (not audio bytes) or ignore transcript events client-side.
+
 This example will work using expo-audio on Android and iOS.
 
 ```python
 from solana_agent import SolanaAgent
 
 solana_agent = SolanaAgent(config=config)
-
-audio_content = await audio_file.read()
-
-async def generate():
-    async for chunk in solana_agent.process(
-        user_id=user_id, 
+        user_id="user123", 
         message=audio_content,
         realtime=True,
         rt_encode_input=True,
         rt_encode_output=True,
+        rt_output_modalities=["audio"],
         rt_voice="marin",
         output_format="audio",
         audio_output_format="mp3",
@@ -314,6 +320,106 @@ return StreamingResponse(
         "X-Accel-Buffering": "no",
     },
 )
+```
+
+### Realtime Text Streaming
+
+Due to the overhead of the router (API call) - realtime only supports a single agent setup.
+
+Realtime uses MongoDB for memory so Zep is not needed.
+
+When using realtime with text input, no audio transcription is needed. The same bypass rules apply—HTTP STT is never called in realtime mode.
+
+```python
+from solana_agent import SolanaAgent
+
+solana_agent = SolanaAgent(config=config)
+
+async def generate():
+    async for chunk in solana_agent.process(
+        user_id="user123", 
+        message="What is the latest news on Solana?",
+        realtime=True,
+        rt_output_modalities=["text"],
+    ):
+        yield chunk
+```
+
+### Dual Modality Realtime Streaming
+
+Solana Agent supports **dual modality realtime streaming**, allowing you to stream both audio and text simultaneously from a single realtime session. This enables rich conversational experiences where users can receive both voice responses and text transcripts in real-time.
+
+#### Features
+- **Simultaneous Audio & Text**: Stream both modalities from the same conversation
+- **Flexible Output**: Choose audio-only, text-only, or both modalities
+- **Real-time Demuxing**: Automatically separate audio and text streams
+- **Mobile Optimized**: Works seamlessly with compressed audio formats (MP4/AAC)
+- **Memory Efficient**: Smart buffering and streaming for optimal performance
+
+#### Mobile App Integration Example
+
+```python
+from fastapi import UploadFile
+from fastapi.responses import StreamingResponse
+from solana_agent import SolanaAgent
+from solana_agent.interfaces.providers.realtime import RealtimeChunk
+import base64
+
+solana_agent = SolanaAgent(config=config)
+
+@app.post("/realtime/dual")
+async def realtime_dual_endpoint(audio_file: UploadFile):
+    """
+    Dual modality (audio + text) realtime endpoint using Server-Sent Events (SSE).
+    Emits:
+      event: audio      (base64 encoded audio frames)
+      event: transcript (incremental text)
+    Notes:
+      - Do NOT set output_format when using both modalities.
+      - If only one modality is requested, plain str (text) or raw audio bytes may be yielded instead of RealtimeChunk.
+    """
+    audio_content = await audio_file.read()
+
+    async def event_stream():
+        async for chunk in solana_agent.process(
+            user_id="mobile_user",
+            message=audio_content,
+            realtime=True,
+            rt_encode_input=True,
+            rt_encode_output=True,
+            rt_output_modalities=["audio", "text"],
+            rt_voice="marin",
+            audio_input_format="mp4",
+            audio_output_format="mp3",
+            # Optionally lock transcription model (otherwise default is auto-selected):
+            # rt_transcription_model="gpt-4o-mini-transcribe",
+        ):
+            if isinstance(chunk, RealtimeChunk):
+                if chunk.is_audio and chunk.audio_data:
+                    b64 = base64.b64encode(chunk.audio_data).decode("ascii")
+                    yield f"event: audio\ndata: {b64}\n\n"
+                elif chunk.is_text and chunk.text_data:
+                    # Incremental transcript (not duplicated at finalize)
+                    yield f"event: transcript\ndata: {chunk.text_data}\n\n"
+                continue
+            # (Defensive) fallback: if something else appears
+            if isinstance(chunk, bytes):
+                b64 = base64.b64encode(chunk).decode("ascii")
+                yield f"event: audio\ndata: {b64}\n\n"
+            elif isinstance(chunk, str):
+                yield f"event: transcript\ndata: {chunk}\n\n"
+
+        yield "event: done\ndata: end\n\n"
+
+    return StreamingResponse(
+        event_stream(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-store",
+            "Access-Control-Allow-Origin": "*",
+        },
+    )
+```
 
 ### Image/Text Streaming
 

diff --git a/docs/index.rst b/docs/index.rst
@@ -223,9 +223,10 @@ This example will work using expo-audio on Android and iOS.
          rt_encode_input=True,
          rt_encode_output=True,
          rt_voice="marin",
+         rt_output_modalities=["audio"],
          output_format="audio",
-         audio_output_format="mp3",
          audio_input_format="m4a",
+         audio_output_format="mp3",
       ):
          yield chunk
 
@@ -240,6 +241,108 @@ This example will work using expo-audio on Android and iOS.
       },
    )
 
+Realtime Text Streaming
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Due to the overhead of the router (API call) - realtime only supports a single agent setup.
+
+Realtime uses MongoDB for memory so Zep is not needed.
+
+.. code-block:: python
+
+   from solana_agent import SolanaAgent
+
+   solana_agent = SolanaAgent(config=config)
+
+   async def generate():
+      async for chunk in solana_agent.process(
+         user_id="user123", 
+         message="What is the latest news on Solana?",
+         realtime=True,
+         rt_output_modalities=["text"],
+      ):
+         yield chunk
+
+Dual Modality Realtime Streaming
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Solana Agent now supports **dual modality realtime streaming**, allowing you to stream both audio and text simultaneously from a single realtime session. This enables rich conversational experiences where users can receive both voice responses and text transcripts in real-time.
+
+Features
+^^^^^^^^
+
+- **Simultaneous Audio & Text**: Stream both modalities from the same conversation
+- **Flexible Output**: Choose audio-only, text-only, or both modalities
+- **Real-time Demuxing**: Automatically separate audio and text streams
+- **Mobile Optimized**: Works seamlessly with compressed audio formats (MP4/MP3)
+- **Memory Efficient**: Smart buffering and streaming for optimal performance
+
+Mobile App Integration Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: python
+
+   from fastapi import UploadFile
+   from fastapi.responses import StreamingResponse
+   from solana_agent import SolanaAgent
+   from solana_agent.interfaces.providers.realtime import RealtimeChunk
+   import base64
+
+   solana_agent = SolanaAgent(config=config)
+
+   @app.post("/realtime/dual")
+   async def realtime_dual_endpoint(audio_file: UploadFile):
+      """
+      Dual modality (audio + text) realtime endpoint using Server-Sent Events (SSE).
+      Emits:
+         event: audio      (base64 encoded audio frames)
+         event: transcript (incremental text)
+      Notes:
+         - Do NOT set output_format when using both modalities.
+         - If only one modality is requested, plain str (text) or raw audio bytes may be yielded instead of RealtimeChunk.
+      """
+      audio_content = await audio_file.read()
+
+      async def event_stream():
+         async for chunk in solana_agent.process(
+               user_id="mobile_user",
+               message=audio_content,
+               realtime=True,
+               rt_encode_input=True,
+               rt_encode_output=True,
+               rt_output_modalities=["audio", "text"],
+               rt_voice="marin",
+               audio_input_format="mp4",
+               audio_output_format="mp3",
+               # Optionally lock transcription model (otherwise default is auto-selected):
+               # rt_transcription_model="gpt-4o-mini-transcribe",
+         ):
+               if isinstance(chunk, RealtimeChunk):
+                  if chunk.is_audio and chunk.audio_data:
+                     b64 = base64.b64encode(chunk.audio_data).decode("ascii")
+                     yield f"event: audio\ndata: {b64}\n\n"
+                  elif chunk.is_text and chunk.text_data:
+                     # Incremental transcript (not duplicated at finalize)
+                     yield f"event: transcript\ndata: {chunk.text_data}\n\n"
+                  continue
+               # (Defensive) fallback: if something else appears
+               if isinstance(chunk, bytes):
+                  b64 = base64.b64encode(chunk).decode("ascii")
+                  yield f"event: audio\ndata: {b64}\n\n"
+               elif isinstance(chunk, str):
+                  yield f"event: transcript\ndata: {chunk}\n\n"
+
+         yield "event: done\ndata: end\n\n"
+
+      return StreamingResponse(
+         event_stream(),
+         media_type="text/event-stream",
+         headers={
+               "Cache-Control": "no-store",
+               "Access-Control-Allow-Origin": "*",
+         },
+      )
+
 Image/Text Streaming
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "solana-agent"
-version = "31.2.6"
+version = "31.3.0"
 description = "AI Agents for Solana"
 authors = ["Bevan Hunt <bevan@bevanhunt.com>"]
 license = "MIT"