44 changes: 28 additions & 16 deletions README.md
@@ -281,27 +281,39 @@ Due to the overhead of the router (API call) - realtime only supports a single a

Realtime uses MongoDB for memory so Zep is not needed.

This example will work using expo-audio on Android and iOS.

```python
from solana_agent import SolanaAgent

solana_agent = SolanaAgent(config=config)

# Example: mobile sends MP4/AAC; server encodes output to AAC
audio_content = await audio_file.read() # bytes
async for audio_chunk in solana_agent.process(
"user123", # required
audio_content, # required
realtime=True, # optional (default False)
output_format="audio", # required
vad=True, # enable VAD (optional)
rt_encode_input=True, # accept compressed input (optional)
rt_encode_output=True, # encode output for client (optional)
rt_voice="marin" # the voice to use for interactions (optional)
audio_input_format="mp4", # client transport (optional)
audio_output_format="aac" # client transport (optional)
):
handle_audio(audio_chunk)
```
audio_content = await audio_file.read()

async def generate():
    async for chunk in solana_agent.process(
        user_id=user_id,
        message=audio_content,
        realtime=True,
        rt_encode_input=True,
        rt_encode_output=True,
        rt_voice="marin",
        output_format="audio",
        audio_output_format="m4a",
        audio_input_format="mp4",
    ):
        yield chunk

return StreamingResponse(
    content=generate(),
    media_type="audio/mp4",
    headers={
        "Cache-Control": "no-store",
        "Pragma": "no-cache",
        "Content-Disposition": "inline; filename=stream.m4a",
        "X-Accel-Buffering": "no",
    },
)
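# --- Illustrative sketch (not part of this diff): one way to wrap the
# --- generator above in a FastAPI route. The route path, form-field names,
# --- and app wiring are assumptions; only the solana_agent.process call and
# --- the StreamingResponse settings mirror the example above.
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import StreamingResponse

from solana_agent import SolanaAgent

app = FastAPI()
solana_agent = SolanaAgent(config=config)  # config as defined in your setup

@app.post("/audio/chat")  # hypothetical route
async def audio_chat(
    user_id: str = Form(...),
    audio_file: UploadFile = File(...),
):
    audio_content = await audio_file.read()

    async def generate():
        async for chunk in solana_agent.process(
            user_id=user_id,
            message=audio_content,
            realtime=True,
            rt_encode_input=True,
            rt_encode_output=True,
            rt_voice="marin",
            output_format="audio",
            audio_output_format="m4a",
            audio_input_format="mp4",
        ):
            yield chunk

    return StreamingResponse(
        content=generate(),
        media_type="audio/mp4",
        headers={
            "Cache-Control": "no-store",
            "Pragma": "no-cache",
            "Content-Disposition": "inline; filename=stream.m4a",
            "X-Accel-Buffering": "no",
        },
    )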

### Image/Text Streaming

43 changes: 28 additions & 15 deletions docs/index.rst
@@ -205,27 +205,40 @@ Due to the overhead of the router (API call) - realtime only supports a single a

Realtime uses MongoDB for memory so Zep is not needed.

This example will work using expo-audio on Android and iOS.

.. code-block:: python

from solana_agent import SolanaAgent

solana_agent = SolanaAgent(config=config)

# Example: mobile sends MP4/AAC; server encodes output to AAC
audio_content = await audio_file.read() # bytes
async for audio_chunk in solana_agent.process(
"user123", # required
audio_content, # required
realtime=True, # optional (default False)
output_format="audio", # required
vad=True, # enable VAD (optional)
rt_encode_input=True, # accept compressed input (optional)
rt_encode_output=True, # encode output for client (optional)
rt_voice="marin" # the voice to use for interactions (optional)
audio_input_format="mp4", # client transport (optional)
audio_output_format="aac" # client transport (optional)
):
handle_audio(audio_chunk)
audio_content = await audio_file.read()

async def generate():
    async for chunk in solana_agent.process(
        user_id=user_id,
        message=audio_content,
        realtime=True,
        rt_encode_input=True,
        rt_encode_output=True,
        rt_voice="marin",
        output_format="audio",
        audio_output_format="m4a",
        audio_input_format="mp4",
    ):
        yield chunk

return StreamingResponse(
    content=generate(),
    media_type="audio/mp4",
    headers={
        "Cache-Control": "no-store",
        "Pragma": "no-cache",
        "Content-Disposition": "inline; filename=stream.m4a",
        "X-Accel-Buffering": "no",
    },
)
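# --- Illustrative sketch (not part of this diff): a minimal Python test client
# --- that streams the m4a reply. The URL and form-field names are assumptions
# --- matching the hypothetical FastAPI route sketched in the README section above.
import httpx

async def stream_reply(audio_bytes: bytes) -> bytes:
    buf = bytearray()
    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream(
            "POST",
            "http://localhost:8000/audio/chat",  # hypothetical endpoint
            data={"user_id": "user123"},
            files={"audio_file": ("input.mp4", audio_bytes, "audio/mp4")},
        ) as response:
            response.raise_for_status()
            async for chunk in response.aiter_bytes():
                buf.extend(chunk)  # or hand chunks to a player as they arrive
    return bytes(buf)

# e.g. asyncio.run(stream_reply(open("question.mp4", "rb").read()))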

Image/Text Streaming
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "solana-agent"
version = "31.2.2"
version = "31.2.3"
description = "AI Agents for Solana"
authors = ["Bevan Hunt <bevan@bevanhunt.com>"]
license = "MIT"
65 changes: 61 additions & 4 deletions solana_agent/adapters/ffmpeg_transcoder.py
@@ -88,13 +88,14 @@ async def to_pcm16( # pragma: no cover
async def from_pcm16( # pragma: no cover
self, pcm16_bytes: bytes, output_mime: str, rate_hz: int
) -> bytes:
"""Encode PCM16LE to desired format (currently AAC ADTS for mobile streaming)."""
"""Encode PCM16LE to desired format (AAC ADTS, fragmented MP4, or MP3)."""
logger.info(
"Encode from PCM16: output_mime=%s, rate_hz=%d, input_len=%d",
output_mime,
rate_hz,
len(pcm16_bytes),
)

if output_mime in ("audio/mpeg", "audio/mp3"):
# Encode to MP3 (often better streaming compatibility on mobile)
args = [
Expand Down Expand Up @@ -122,8 +123,9 @@ async def from_pcm16( # pragma: no cover
"Encoded from PCM16 to %s: output_len=%d", output_mime, len(out)
)
return out
if output_mime in ("audio/aac", "audio/mp4", "audio/m4a"):
# Encode to AAC in ADTS stream; clients can play it as AAC.

if output_mime in ("audio/aac",):
# Encode to AAC in ADTS stream; good for streaming over sockets/HTTP chunked
args = [
"-hide_banner",
"-loglevel",
@@ -149,6 +151,38 @@
"Encoded from PCM16 to %s: output_len=%d", output_mime, len(out)
)
return out

if output_mime in ("audio/mp4", "audio/m4a"):
# Encode to fragmented MP4 (fMP4) with AAC for better iOS compatibility
# For streaming, write an initial moov and fragment over stdout.
args = [
"-hide_banner",
"-loglevel",
"error",
"-f",
"s16le",
"-ac",
"1",
"-ar",
str(rate_hz),
"-i",
"pipe:0",
"-c:a",
"aac",
"-b:a",
"96k",
"-movflags",
"+frag_keyframe+empty_moov",
"-f",
"mp4",
"pipe:1",
]
out = await self._run_ffmpeg(args, pcm16_bytes)
logger.info(
"Encoded from PCM16 to %s (fMP4): output_len=%d", output_mime, len(out)
)
return out

# Default: passthrough
logger.info("Encode passthrough (no change), output_len=%d", len(pcm16_bytes))
return pcm16_bytes
@@ -187,7 +221,7 @@ async def stream_from_pcm16( # pragma: no cover
"mp3",
"pipe:1",
]
elif output_mime in ("audio/aac", "audio/mp4", "audio/m4a"):
elif output_mime in ("audio/aac",):
args = [
"-hide_banner",
"-loglevel",
@@ -208,6 +242,29 @@
"adts",
"pipe:1",
]
elif output_mime in ("audio/mp4", "audio/m4a"):
args = [
"-hide_banner",
"-loglevel",
"error",
"-f",
"s16le",
"-ac",
"1",
"-ar",
str(rate_hz),
"-i",
"pipe:0",
"-c:a",
"aac",
"-b:a",
"96k",
"-movflags",
"+frag_keyframe+empty_moov",
"-f",
"mp4",
"pipe:1",
]
else:
# Passthrough streaming: just yield input
async for chunk in pcm_iter:
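For orientation, the new fragmented-MP4 path above can be exercised directly through `from_pcm16`. The sketch below is assumption-laden: the class name `FFmpegTranscoder`, its no-argument constructor, and the 24 kHz mono input rate are inferred from the module path and the realtime pipeline, not confirmed by this diff; only the `from_pcm16` signature comes from the code above.

```python
# Hypothetical usage sketch; FFmpegTranscoder and its constructor are assumed.
from solana_agent.adapters.ffmpeg_transcoder import FFmpegTranscoder


async def encode_for_ios(pcm16_bytes: bytes) -> bytes:
    transcoder = FFmpegTranscoder()
    # "audio/mp4" (or "audio/m4a") now selects the fragmented-MP4 branch:
    # ffmpeg writes an empty moov up front and emits fragments to stdout,
    # so the bytes can be streamed to iOS clients as they are produced.
    return await transcoder.from_pcm16(
        pcm16_bytes, output_mime="audio/mp4", rate_hz=24000
    )
```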
40 changes: 38 additions & 2 deletions solana_agent/adapters/openai_realtime_ws.py
@@ -325,7 +325,26 @@ async def _recv_loop(self) -> None: # pragma: no cover
try:
chunk = base64.b64decode(b64)
self._audio_queue.put_nowait(chunk)
logger.info("Audio delta bytes=%d", len(chunk))
# Ownership/response tagging for diagnostics
try:
owner = getattr(self, "_owner_user_id", None)
except Exception:
owner = None
try:
rid = getattr(self, "_active_response_id", None)
except Exception:
rid = None
try:
gen = int(getattr(self, "_response_generation", 0))
except Exception:
gen = None
logger.info(
"Audio delta bytes=%d owner=%s rid=%s gen=%s",
len(chunk),
owner,
rid,
gen,
)
try:
# New response detected if we were previously inactive
if not getattr(self, "_response_active", False):
@@ -492,8 +511,25 @@ async def _recv_loop(self) -> None: # pragma: no cover
"response.audio.done",
):
# End of audio stream for the response; stop audio iterator but keep WS open for transcripts
try:
owner = getattr(self, "_owner_user_id", None)
except Exception:
owner = None
try:
rid = (data.get("response") or {}).get("id") or getattr(
self, "_active_response_id", None
)
except Exception:
rid = None
try:
gen = int(getattr(self, "_response_generation", 0))
except Exception:
gen = None
logger.info(
"Realtime WS: output audio done; ending audio stream"
"Realtime WS: output audio done; owner=%s rid=%s gen=%s",
owner,
rid,
gen,
)
# If we have a buffered transcript for this response, flush it now
try:
Expand Down