(fix): websocket api respects output format (elevenlabs#321)

fern-support · web-flow · commit 7af17bce7abc · 2024-07-18T08:02:11.000-04:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "elevenlabs"
-version = "v1.4.1"
+version = "v1.5.0"
 description = ""
 readme = "README.md"
 authors = []
diff --git a/src/elevenlabs/client.py b/src/elevenlabs/client.py
@@ -14,6 +14,7 @@
   PronunciationDictionaryVersionLocator, Model
 from .environment import ElevenLabsEnvironment
 from .realtime_tts import RealtimeTextToSpeechClient
+from .types import OutputFormat
 
 
 DEFAULT_VOICE = Voice(
@@ -124,7 +125,7 @@ def generate(
       model: Union[ModelId, Model] = "eleven_monolingual_v1",
       optimize_streaming_latency: typing.Optional[int] = 0,
       stream: bool = False,
-      output_format: Optional[str] = "mp3_44100_128",
+      output_format: Optional[OutputFormat] = "mp3_44100_128",
       pronunciation_dictionary_locators: typing.Optional[
             typing.Sequence[PronunciationDictionaryVersionLocator]
         ] = OMIT,
@@ -152,7 +153,7 @@ def generate(
 
                             Defaults to False.                                                                
 
-            - output_format: typing.Optional[str]. Output format of the generated audio. Must be one of:
+            - output_format: typing.Optional[OutputFormat]. Output format of the generated audio. Must be one of:
                                                    mp3_22050_32 - output format, mp3 with 22.05kHz sample rate at 32kbps.
                                                    mp3_44100_32 - output format, mp3 with 44.1kHz sample rate at 32kbps.
                                                    mp3_44100_64 - output format, mp3 with 44.1kHz sample rate at 64kbps.
@@ -303,7 +304,7 @@ async def generate(
       model: Union[ModelId, Model] = "eleven_monolingual_v1",
       optimize_streaming_latency: typing.Optional[int] = 0,
       stream: bool = False,
-      output_format: Optional[str] = "mp3_44100_128",
+      output_format: Optional[OutputFormat] = "mp3_44100_128",
       pronunciation_dictionary_locators: typing.Optional[
             typing.Sequence[PronunciationDictionaryVersionLocator]
         ] = OMIT,
@@ -338,7 +339,7 @@ async def generate(
 
                             Defaults to False.                                                                
 
-            - output_format: typing.Optional[str]. Output format of the generated audio. Must be one of:
+            - output_format: typing.Optional[OutputFormat]. Output format of the generated audio. Must be one of:
                                                    mp3_22050_32 - output format, mp3 with 22.05kHz sample rate at 32kbps.
                                                    mp3_44100_32 - output format, mp3 with 44.1kHz sample rate at 32kbps.
                                                    mp3_44100_64 - output format, mp3 with 44.1kHz sample rate at 64kbps.
diff --git a/src/elevenlabs/realtime_tts.py b/src/elevenlabs/realtime_tts.py
@@ -14,6 +14,7 @@
 from .core.request_options import RequestOptions
 from .types.voice_settings import VoiceSettings
 from .text_to_speech.client import TextToSpeechClient
+from .types import OutputFormat
 
 # this is used as the default value for optional parameters
 OMIT = typing.cast(typing.Any, ...)
@@ -45,6 +46,7 @@ def convert_realtime(
         *,
         text: typing.Iterator[str],
         model_id: typing.Optional[str] = OMIT,
+        output_format: typing.Optional[OutputFormat] = "mp3_44100_128",
         voice_settings: typing.Optional[VoiceSettings] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.Iterator[bytes]:
@@ -86,7 +88,8 @@ def get_text() -> typing.Iterator[str]:
         """
         with connect(
             urllib.parse.urljoin(
-              "wss://api.elevenlabs.io/", f"v1/text-to-speech/{jsonable_encoder(voice_id)}/stream-input?model_id={model_id}"
+              "wss://api.elevenlabs.io/", 
+              f"v1/text-to-speech/{jsonable_encoder(voice_id)}/stream-input?model_id={model_id}&output_format={output_format}"
             ),
             additional_headers=jsonable_encoder(
                 remove_none_from_dict(