Skip to content

Commit bef8731

Browse files
deploy: 61d1907
1 parent c5095d8 commit bef8731

File tree

8 files changed

+215
-6
lines changed

8 files changed

+215
-6
lines changed

.doctrees/api/index.doctree

2.78 KB
Binary file not shown.

.doctrees/environment.pickle

1.99 KB
Binary file not shown.

.doctrees/index.doctree

10.3 KB
Binary file not shown.

_modules/solana_agent/client/solana_agent.html

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ <h1>Source code for solana_agent.client.solana_agent</h1><div class="highlight">
8787
<span class="kn">from</span><span class="w"> </span><span class="nn">solana_agent.interfaces.plugins.plugins</span><span class="w"> </span><span class="kn">import</span> <span class="n">Tool</span>
8888
<span class="kn">from</span><span class="w"> </span><span class="nn">solana_agent.services.knowledge_base</span><span class="w"> </span><span class="kn">import</span> <span class="n">KnowledgeBaseService</span>
8989
<span class="kn">from</span><span class="w"> </span><span class="nn">solana_agent.interfaces.services.routing</span><span class="w"> </span><span class="kn">import</span> <span class="n">RoutingService</span> <span class="k">as</span> <span class="n">RoutingInterface</span>
90+
<span class="kn">from</span><span class="w"> </span><span class="nn">solana_agent.interfaces.providers.realtime</span><span class="w"> </span><span class="kn">import</span> <span class="n">RealtimeChunk</span>
9091

9192

9293
<div class="viewcode-block" id="SolanaAgent">
@@ -132,6 +133,7 @@ <h1>Source code for solana_agent.client.solana_agent</h1><div class="highlight">
132133
<span class="n">vad</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
133134
<span class="n">rt_encode_input</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
134135
<span class="n">rt_encode_output</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
136+
<span class="n">rt_output_modalities</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Literal</span><span class="p">[</span><span class="s2">&quot;audio&quot;</span><span class="p">,</span> <span class="s2">&quot;text&quot;</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
135137
<span class="n">rt_voice</span><span class="p">:</span> <span class="n">Literal</span><span class="p">[</span>
136138
<span class="s2">&quot;alloy&quot;</span><span class="p">,</span>
137139
<span class="s2">&quot;ash&quot;</span><span class="p">,</span>
@@ -165,7 +167,9 @@ <h1>Source code for solana_agent.client.solana_agent</h1><div class="highlight">
165167
<span class="n">router</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">RoutingInterface</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
166168
<span class="n">images</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">bytes</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
167169
<span class="n">output_model</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Type</span><span class="p">[</span><span class="n">BaseModel</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
168-
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">AsyncGenerator</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">bytes</span><span class="p">,</span> <span class="n">BaseModel</span><span class="p">],</span> <span class="kc">None</span><span class="p">]:</span> <span class="c1"># pragma: no cover</span>
170+
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">AsyncGenerator</span><span class="p">[</span>
171+
<span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">bytes</span><span class="p">,</span> <span class="n">BaseModel</span><span class="p">,</span> <span class="n">RealtimeChunk</span><span class="p">],</span> <span class="kc">None</span>
172+
<span class="p">]:</span> <span class="c1"># pragma: no cover</span>
169173
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Process a user message (text or audio) and optional images, returning the response stream.</span>
170174

171175
<span class="sd"> Args:</span>
@@ -179,6 +183,7 @@ <h1>Source code for solana_agent.client.solana_agent</h1><div class="highlight">
179183
<span class="sd"> vad: Whether to use voice activity detection (for audio input)</span>
180184
<span class="sd"> rt_encode_input: Whether to re-encode input audio for compatibility</span>
181185
<span class="sd"> rt_encode_output: Whether to re-encode output audio for compatibility</span>
186+
<span class="sd"> rt_output_modalities: Modalities to return in realtime (default both if None)</span>
182187
<span class="sd"> rt_voice: Voice to use for realtime audio output</span>
183188
<span class="sd"> audio_voice: Voice to use for audio output</span>
184189
<span class="sd"> audio_output_format: Audio output format</span>
@@ -199,6 +204,7 @@ <h1>Source code for solana_agent.client.solana_agent</h1><div class="highlight">
199204
<span class="n">vad</span><span class="o">=</span><span class="n">vad</span><span class="p">,</span>
200205
<span class="n">rt_encode_input</span><span class="o">=</span><span class="n">rt_encode_input</span><span class="p">,</span>
201206
<span class="n">rt_encode_output</span><span class="o">=</span><span class="n">rt_encode_output</span><span class="p">,</span>
207+
<span class="n">rt_output_modalities</span><span class="o">=</span><span class="n">rt_output_modalities</span><span class="p">,</span>
202208
<span class="n">rt_voice</span><span class="o">=</span><span class="n">rt_voice</span><span class="p">,</span>
203209
<span class="n">audio_voice</span><span class="o">=</span><span class="n">audio_voice</span><span class="p">,</span>
204210
<span class="n">audio_output_format</span><span class="o">=</span><span class="n">audio_output_format</span><span class="p">,</span>

_sources/index.rst.txt

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,9 +223,10 @@ This example will work using expo-audio on Android and iOS.
223223
rt_encode_input=True,
224224
rt_encode_output=True,
225225
rt_voice="marin",
226+
rt_output_modalities=["audio"],
226227
output_format="audio",
227-
audio_output_format="mp3",
228228
audio_input_format="m4a",
229+
audio_output_format="mp3",
229230
):
230231
yield chunk
231232
@@ -240,6 +241,108 @@ This example will work using expo-audio on Android and iOS.
240241
},
241242
)
242243
244+
Realtime Text Streaming
245+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
246+
247+
Because the router adds an extra API call, realtime mode supports only a single-agent setup.
248+
249+
Realtime uses MongoDB for memory, so Zep is not needed.
250+
251+
.. code-block:: python
252+
253+
from solana_agent import SolanaAgent
254+
255+
solana_agent = SolanaAgent(config=config)
256+
257+
async def generate():
258+
async for chunk in solana_agent.process(
259+
user_id="user123",
260+
message="What is the latest news on Solana?",
261+
realtime=True,
262+
rt_output_modalities=["text"],
263+
):
264+
yield chunk
265+
266+
Dual Modality Realtime Streaming
267+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
268+
269+
Solana Agent now supports **dual-modality realtime streaming**, allowing you to stream both audio and text simultaneously from a single realtime session. This enables rich conversational experiences where users receive both voice responses and text transcripts in real time.
270+
271+
Features
272+
^^^^^^^^
273+
274+
- **Simultaneous Audio & Text**: Stream both modalities from the same conversation
275+
- **Flexible Output**: Choose audio-only, text-only, or both modalities
276+
- **Real-time Demuxing**: Automatically separate audio and text streams
277+
- **Mobile Optimized**: Works seamlessly with compressed audio formats (MP4/MP3)
278+
- **Memory Efficient**: Smart buffering and streaming for optimal performance
279+
280+
Mobile App Integration Example
281+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
282+
283+
.. code-block:: python
284+
285+
from fastapi import UploadFile
286+
from fastapi.responses import StreamingResponse
287+
from solana_agent import SolanaAgent
288+
from solana_agent.interfaces.providers.realtime import RealtimeChunk
289+
import base64
290+
291+
solana_agent = SolanaAgent(config=config)
292+
293+
@app.post("/realtime/dual")
294+
async def realtime_dual_endpoint(audio_file: UploadFile):
295+
"""
296+
Dual modality (audio + text) realtime endpoint using Server-Sent Events (SSE).
297+
Emits:
298+
event: audio (base64 encoded audio frames)
299+
event: transcript (incremental text)
300+
Notes:
301+
- Do NOT set output_format when using both modalities.
302+
- If only one modality is requested, plain str (text) or raw audio bytes may be yielded instead of RealtimeChunk.
303+
"""
304+
audio_content = await audio_file.read()
305+
306+
async def event_stream():
307+
async for chunk in solana_agent.process(
308+
user_id="mobile_user",
309+
message=audio_content,
310+
realtime=True,
311+
rt_encode_input=True,
312+
rt_encode_output=True,
313+
rt_output_modalities=["audio", "text"],
314+
rt_voice="marin",
315+
audio_input_format="mp4",
316+
audio_output_format="mp3",
317+
# Optionally lock transcription model (otherwise default is auto-selected):
318+
# rt_transcription_model="gpt-4o-mini-transcribe",
319+
):
320+
if isinstance(chunk, RealtimeChunk):
321+
if chunk.is_audio and chunk.audio_data:
322+
b64 = base64.b64encode(chunk.audio_data).decode("ascii")
323+
yield f"event: audio\ndata: {b64}\n\n"
324+
elif chunk.is_text and chunk.text_data:
325+
# Incremental transcript (not duplicated at finalize)
326+
yield f"event: transcript\ndata: {chunk.text_data}\n\n"
327+
continue
328+
# (Defensive) fallback: if something else appears
329+
if isinstance(chunk, bytes):
330+
b64 = base64.b64encode(chunk).decode("ascii")
331+
yield f"event: audio\ndata: {b64}\n\n"
332+
elif isinstance(chunk, str):
333+
yield f"event: transcript\ndata: {chunk}\n\n"
334+
335+
yield "event: done\ndata: end\n\n"
336+
337+
return StreamingResponse(
338+
event_stream(),
339+
media_type="text/event-stream",
340+
headers={
341+
"Cache-Control": "no-store",
342+
"Access-Control-Allow-Origin": "*",
343+
},
344+
)
345+
243346
Image/Text Streaming
244347
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
245348

0 commit comments

Comments
 (0)