Skip to content

Commit

Permalink
SDK Regeneration
Browse files Browse the repository at this point in the history
  • Loading branch information
fern-bot committed Mar 12, 2024
1 parent a3eaeee commit 81925c1
Show file tree
Hide file tree
Showing 4 changed files with 168 additions and 156 deletions.
2 changes: 1 addition & 1 deletion .fernignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

src/elevenlabs/client.py
src/elevenlabs/play.py
src/elevenlabs/tts.py
src/elevenlabs/realitme_tts.py

.github/workflows/ci.yml

Expand Down
68 changes: 35 additions & 33 deletions src/elevenlabs/client.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import typing
import json
import re
import os
import httpx

from typing import Iterator, Optional, Union, \
Optional, AsyncIterator
Expand All @@ -10,6 +12,8 @@
from .core import RequestOptions, ApiError
from .types import Voice, VoiceSettings, \
PronunciationDictionaryVersionLocator, Model
from .environment import ElevenLabsEnvironment
from .realitme_tts import RealtimeTextToSpeechClient


DEFAULT_VOICE = Voice(
Expand Down Expand Up @@ -58,6 +62,23 @@ class ElevenLabs(BaseElevenLabs):
api_key="YOUR_API_KEY",
)
"""
def __init__(
    self,
    *,
    base_url: typing.Optional[str] = None,
    environment: ElevenLabsEnvironment = ElevenLabsEnvironment.PRODUCTION,
    api_key: typing.Optional[str] = os.getenv("ELEVEN_API_KEY"),
    timeout: typing.Optional[float] = 60,
    httpx_client: typing.Optional[httpx.Client] = None
):
    """Construct the client, then swap in the realtime text-to-speech client.

    All parameters are forwarded unchanged to ``BaseElevenLabs.__init__``:

    - base_url: explicit API base URL (takes precedence over ``environment``
      in the generated base client — TODO confirm against BaseElevenLabs).
    - environment: predefined API environment; defaults to production.
    - api_key: API key. NOTE: the ``ELEVEN_API_KEY`` fallback is a default
      argument value, so the environment variable is read once at import
      time, not per call.
    - timeout: request timeout in seconds; defaults to 60.
    - httpx_client: optional pre-configured ``httpx.Client`` to reuse.
    """
    super().__init__(
        base_url=base_url,
        environment=environment,
        api_key=api_key,
        timeout=timeout,
        httpx_client=httpx_client
    )
    # Replace the generated text_to_speech sub-client with the realtime
    # subclass so callers (e.g. `generate`) gain `convert_realtime` for
    # websocket input streaming while keeping the plain HTTP methods.
    self.text_to_speech = RealtimeTextToSpeechClient(client_wrapper=self._client_wrapper)

def clone(
self,
Expand Down Expand Up @@ -187,16 +208,12 @@ def generate(
model_id=model_id
)
elif isinstance(text, Iterator):
# TODO(fern): Update to WebSocket
return self.text_to_speech.convert_as_stream(
return self.text_to_speech.convert_realtime( # type: ignore
voice_id=voice_id,
model_id=model_id,
voice_settings=voice_settings,
optimize_streaming_latency=optimize_streaming_latency,
output_format=output_format,
text=text,
request_options=request_options,
pronunciation_dictionary_locators=pronunciation_dictionary_locators
model_id=model_id
)
else:
raise ApiError(body="Text is neither a string nor an iterator.")
Expand Down Expand Up @@ -280,7 +297,7 @@ async def clone(
async def generate(
self,
*,
text: Union[str, Iterator[str]],
text: str,
voice: Union[VoiceId, VoiceName, Voice] = DEFAULT_VOICE,
voice_settings: typing.Optional[VoiceSettings] = DEFAULT_VOICE.settings,
model: Union[ModelId, Model] = "eleven_monolingual_v1",
Expand All @@ -300,7 +317,7 @@ async def generate(
calls to the `text_to_speech.convert` and`text_to_speech.convert_as_stream`
functions.
- text: Union[str, Iterator[str]]. The string or stream of strings that will get converted into speech.
- text: str. The string that will get converted into speech. The Async client does not support streaming.
- voice: str. A voice id, name, or voice response. Defaults to the Rachel voice.
Expand Down Expand Up @@ -363,31 +380,16 @@ async def generate(
model_id = model.model_id

if stream:
if isinstance(text, str):
return self.text_to_speech.convert_as_stream(
voice_id=voice_id,
model_id=model_id,
voice_settings=voice_settings,
optimize_streaming_latency=optimize_streaming_latency,
output_format=output_format,
text=text,
request_options=request_options,
pronunciation_dictionary_locators=pronunciation_dictionary_locators
)
elif isinstance(text, Iterator):
# TODO(fern): Update to WebSocket
return self.text_to_speech.convert_as_stream(
voice_id=voice_id,
model_id=model_id,
voice_settings=voice_settings,
optimize_streaming_latency=optimize_streaming_latency,
output_format=output_format,
text=text,
request_options=request_options,
pronunciation_dictionary_locators=pronunciation_dictionary_locators
)
else:
raise ApiError(body="Text is neither a string nor an iterator.")
return self.text_to_speech.convert_as_stream(
voice_id=voice_id,
model_id=model_id,
voice_settings=voice_settings,
optimize_streaming_latency=optimize_streaming_latency,
output_format=output_format,
text=text,
request_options=request_options,
pronunciation_dictionary_locators=pronunciation_dictionary_locators
)
else:
if not isinstance(text, str):
raise ApiError(body="Text must be a string when stream is False.")
Expand Down
132 changes: 132 additions & 0 deletions src/elevenlabs/realitme_tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# This file was auto-generated by Fern from our API Definition.

import typing
import urllib.parse
import json
import base64
import websockets

from websockets.sync.client import connect

from .core.api_error import ApiError
from .core.jsonable_encoder import jsonable_encoder
from .core.remove_none_from_dict import remove_none_from_dict
from .core.request_options import RequestOptions
from .types.voice_settings import VoiceSettings
from .text_to_speech.client import TextToSpeechClient

# this is used as the default value for optional parameters
OMIT = typing.cast(typing.Any, ...)


def text_chunker(chunks: typing.Iterator[str]) -> typing.Iterator[str]:
    """Re-chunk an input text stream at natural break characters.

    Accumulates incoming fragments and emits them at punctuation/space
    boundaries so each emitted chunk ends with a space (the websocket
    endpoint expects space-terminated text messages). The final flush
    always appends one trailing space to whatever remains.
    """
    break_chars = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ")

    def spaced(piece: str) -> str:
        # Guarantee a single trailing space on an emitted chunk.
        return piece if piece.endswith(" ") else piece + " "

    pending = ""
    for piece in chunks:
        if pending.endswith(break_chars):
            # Buffered text already ends on a boundary: flush it whole.
            yield spaced(pending)
            pending = piece
        elif piece.startswith(break_chars):
            # Incoming fragment opens with a boundary: flush buffer plus
            # that boundary character, keep the remainder buffered.
            yield spaced(pending + piece[0])
            pending = piece[1:]
        else:
            pending = pending + piece
    if pending != "":
        # Final flush appends a space unconditionally (matches the
        # endpoint's expectation of space-terminated messages).
        yield pending + " "


class RealtimeTextToSpeechClient(TextToSpeechClient):
    """Text-to-speech client extended with realtime input streaming.

    Adds `convert_realtime`, which streams text to the
    ``/v1/text-to-speech/{voice_id}/stream-input`` websocket endpoint and
    yields audio chunks as they are produced.
    """

    def convert_realtime(
        self,
        voice_id: str,
        *,
        text: typing.Iterator[str],
        model_id: typing.Optional[str] = OMIT,
        voice_settings: typing.Optional[VoiceSettings] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.Iterator[bytes]:
        """
        Converts a stream of text into speech using a voice of your choice and yields audio chunks.
        Parameters:
        - voice_id: str. Voice ID to be used, you can use https://api.elevenlabs.io/v1/voices to list all the available voices.
        - text: typing.Iterator[str]. The text that will get converted into speech.
        - model_id: typing.Optional[str]. Identifier of the model that will be used, you can query them using GET /v1/models. The model needs to have support for text to speech, you can check this using the can_do_text_to_speech property.
        - voice_settings: typing.Optional[VoiceSettings]. Voice settings overriding stored settings for the given voice. They are applied only on the given request.
        - request_options: typing.Optional[RequestOptions]. Request-specific configuration.
        ---
        from elevenlabs import VoiceSettings
        from elevenlabs.client import ElevenLabs
        def get_text() -> typing.Iterator[str]:
            yield "Hello, how are you?"
            yield "I am fine, thank you."
        client = ElevenLabs(
            api_key="YOUR_API_KEY",
        )
        client.text_to_speech.convert_realtime(
            voice_id="string",
            text=get_text(),
            model_id="string",
            voice_settings=VoiceSettings(
                stability=1.1,
                similarity_boost=1.1,
                style=1.1,
                use_speaker_boost=True,
            ),
        )
        """
        url = (
            "wss://api.elevenlabs.io/"
            f"v1/text-to-speech/{jsonable_encoder(voice_id)}/stream-input"
        )
        # BUGFIX: model_id defaults to the OMIT sentinel (Ellipsis); the old
        # f-string rendered "?model_id=Ellipsis" when the caller omitted it.
        if model_id is not None and model_id is not OMIT:
            url += f"?model_id={model_id}"

        with connect(
            url,
            additional_headers=jsonable_encoder(
                remove_none_from_dict(
                    {
                        **self._client_wrapper.get_headers(),
                        **(request_options.get("additional_headers", {}) if request_options is not None else {}),
                    }
                )
            )
        ) as socket:
            # Prime the stream: the protocol expects an initial single-space
            # message carrying voice settings and generation config.
            # BUGFIX: OMIT (Ellipsis) is truthy, so the old
            # `voice_settings.dict() if voice_settings else None` raised
            # AttributeError when voice_settings was omitted.
            has_settings = voice_settings is not None and voice_settings is not OMIT
            socket.send(json.dumps(
                dict(
                    text=" ",
                    try_trigger_generation=True,
                    voice_settings=voice_settings.dict() if has_settings else None,
                    generation_config=dict(
                        chunk_length_schedule=[50],
                    ),
                )
            ))

            # BUGFIX: initialize so the ConnectionClosed handler below cannot
            # hit a NameError when `text` yields nothing.
            data: typing.Dict[str, typing.Any] = {}

            for text_chunk in text_chunker(text):
                socket.send(json.dumps(dict(text=text_chunk, try_trigger_generation=True)))
                # Opportunistically drain audio that is already available;
                # the near-zero timeout keeps sending non-blocking.
                try:
                    data = json.loads(socket.recv(1e-4))
                    if "audio" in data and data["audio"]:
                        yield base64.b64decode(data["audio"])  # type: ignore
                except TimeoutError:
                    pass

            # An empty text message signals end-of-input to the server.
            socket.send(json.dumps(dict(text="")))

            # Drain remaining audio until the server closes the connection.
            while True:
                try:
                    data = json.loads(socket.recv())
                    if "audio" in data and data["audio"]:
                        yield base64.b64decode(data["audio"])  # type: ignore
                except websockets.exceptions.ConnectionClosed:
                    # A "message" field on the last payload carries a
                    # server-side error description.
                    if "message" in data:
                        raise ApiError(body=data)
                    break

122 changes: 0 additions & 122 deletions src/elevenlabs/tts.py

This file was deleted.

0 comments on commit 81925c1

Please sign in to comment.