Skip to content

Commit 39a6c0e

Browse files
Fix realtime sessions and iOS encoding (#131)
* update * wip * done and working on iOS and Android
1 parent 9a7243f commit 39a6c0e

File tree

5 files changed

+57
-12
lines changed

5 files changed

+57
-12
lines changed

README.md

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -299,18 +299,18 @@ async def generate():
299299
rt_encode_output=True,
300300
rt_voice="marin",
301301
output_format="audio",
302-
audio_output_format="m4a",
302+
audio_output_format="mp3",
303303
audio_input_format="mp4",
304304
):
305305
yield chunk
306306

307307
return StreamingResponse(
308308
content=generate(),
309-
media_type="audio/mp4",
309+
media_type="audio/mp3",
310310
headers={
311311
"Cache-Control": "no-store",
312312
"Pragma": "no-cache",
313-
"Content-Disposition": "inline; filename=stream.m4a",
313+
"Content-Disposition": "inline; filename=stream.mp3",
314314
"X-Accel-Buffering": "no",
315315
},
316316
)

docs/index.rst

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -224,18 +224,18 @@ This example will work using expo-audio on Android and iOS.
224224
rt_encode_output=True,
225225
rt_voice="marin",
226226
output_format="audio",
227-
audio_output_format="m4a",
228-
audio_input_format="mp4",
227+
audio_output_format="mp3",
228+
audio_input_format="m4a",
229229
):
230230
yield chunk
231231
232232
return StreamingResponse(
233233
content=generate(),
234-
media_type="audio/mp4",
234+
media_type="audio/mp3",
235235
headers={
236236
"Cache-Control": "no-store",
237237
"Pragma": "no-cache",
238-
"Content-Disposition": "inline; filename=stream.m4a",
238+
"Content-Disposition": "inline; filename=stream.mp3",
239239
"X-Accel-Buffering": "no",
240240
},
241241
)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "solana-agent"
3-
version = "31.2.3"
3+
version = "31.2.4"
44
description = "AI Agents for Solana"
55
authors = ["Bevan Hunt <bevan@bevanhunt.com>"]
66
license = "MIT"

solana_agent/adapters/ffmpeg_transcoder.py

Lines changed: 40 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,8 @@
44
import contextlib
55
import logging
66
from typing import List, AsyncGenerator
7+
import tempfile
8+
import os
79

810
from solana_agent.interfaces.providers.audio import AudioTranscoder
911

@@ -49,11 +51,45 @@ async def to_pcm16( # pragma: no cover
4951
rate_hz,
5052
len(audio_bytes),
5153
)
52-
# Prefer to hint format for common containers/codecs; ffmpeg can still autodetect if hint is wrong.
53-
hinted_format = None
54+
# iOS-recorded MP4/M4A often requires a seekable input for reliable demuxing.
55+
# Decode from a temporary file instead of stdin for MP4/M4A.
5456
if input_mime in ("audio/mp4", "audio/m4a"):
55-
hinted_format = "mp4"
56-
elif input_mime in ("audio/aac",):
57+
suffix = ".m4a" if input_mime == "audio/m4a" else ".mp4"
58+
tmp_path = None
59+
try:
60+
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tf:
61+
tmp_path = tf.name
62+
tf.write(audio_bytes)
63+
args = [
64+
"-hide_banner",
65+
"-loglevel",
66+
"error",
67+
"-i",
68+
tmp_path,
69+
"-vn", # ignore any video tracks
70+
"-acodec",
71+
"pcm_s16le",
72+
"-ac",
73+
"1",
74+
"-ar",
75+
str(rate_hz),
76+
"-f",
77+
"s16le",
78+
"pipe:1",
79+
]
80+
out = await self._run_ffmpeg(args, b"")
81+
logger.info(
82+
"Transcoded (MP4/M4A temp-file) to PCM16: output_len=%d", len(out)
83+
)
84+
return out
85+
finally:
86+
if tmp_path:
87+
with contextlib.suppress(Exception):
88+
os.remove(tmp_path)
89+
90+
# For other formats, prefer a format hint when helpful and decode from stdin.
91+
hinted_format = None
92+
if input_mime in ("audio/aac",):
5793
# Raw AAC is typically in ADTS stream format
5894
hinted_format = "adts"
5995
elif input_mime in ("audio/ogg", "audio/webm"):

solana_agent/adapters/openai_realtime_ws.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1037,6 +1037,15 @@ def _strip_tool_strict(tools_val):
10371037
if "tools" in patch:
10381038
patch["tools"] = _strip_tool_strict(patch["tools"]) # idempotent
10391039

1040+
# Per server requirements, always include session.type and output_modalities
1041+
try:
1042+
patch["type"] = "realtime"
1043+
# Preserve caller-provided output_modalities if present, otherwise default to audio
1044+
if "output_modalities" not in patch:
1045+
patch["output_modalities"] = ["audio"]
1046+
except Exception:
1047+
pass
1048+
10401049
payload = {"type": "session.update", "session": patch}
10411050
# Mark awaiting updated and store last patch
10421051
self._last_session_patch = patch or {}

0 commit comments

Comments
 (0)