OpenInterpreter · KillianLucas · Jun 21, 2024 · May 3, 2024 · May 3, 2024 · May 3, 2024
diff --git a/README.md b/README.md
@@ -127,7 +127,9 @@ If you want to run local speech-to-text using Whisper, you must install Rust. Fo
 
 ## Customizations
 
-To customize the behavior of the system, edit the [system message, model, skills library path,](https://docs.openinterpreter.com/settings/all-settings) etc. in `i.py`. This file sets up an interpreter, and is powered by Open Interpreter.
+To customize the behavior of the system, edit the [system message, model, skills library path,](https://docs.openinterpreter.com/settings/all-settings) etc. in the `profiles` directory under the `server` directory. Each profile there sets up an interpreter, and is powered by Open Interpreter.
+
+To specify the text-to-speech service used by the 01 desktop client (`base_device.py`), set `interpreter.tts` in a profile to one of "openai" for OpenAI, "elevenlabs" for ElevenLabs, or "coqui" for Coqui (local). For the 01 Light, set `SPEAKER_SAMPLE_RATE` to 24000 for Coqui (local) or 22050 for OpenAI TTS. We currently don't support ElevenLabs TTS on the 01 Light.
 
 ## Ubuntu Dependencies
 

diff --git a/software/poetry.lock b/software/poetry.lock
diff --git a/software/pyproject.toml b/software/pyproject.toml
@@ -28,13 +28,27 @@ psutil = "^5.9.8"
 typer = "^0.9.0"
 platformdirs = "^4.2.0"
 rich = "^13.7.1"
-open-interpreter = {extras = ["os"], version = "^0.2.5"}
-dateparser = "^1.2.0"
 pytimeparse = "^1.1.8"
 python-crontab = "^3.0.0"
 inquirer = "^3.2.4"
 pyqrcode = "^1.2.1"
+realtimestt = "^0.1.12"
+realtimetts = "^0.4.1"
+keyboard = "^0.13.5"
+pyautogui = "^0.9.54"
+ctranslate2 = "4.1.0"
+py3-tts = "^3.5"
+elevenlabs = "1.2.2"
+groq = "^0.5.0"
+open-interpreter = {extras = ["os"], version = "^0.2.6"}
+litellm = "1.35.35"
+openai = "1.30.5"
+pywebview = "*"
+pyobjc = "*"
 
+sentry-sdk = "^2.4.0"
+plyer = "^2.1.0"
+pywinctl = "^0.3"
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"

diff --git a/software/pytest.ini b/software/pytest.ini
@@ -1,5 +1,6 @@
 ; Config for Pytest Runner.
 ; suppress Deprecation Warning and User Warning to not spam the interface, but check periodically
+
 [pytest]
 python_files = tests.py test_*.py
 filterwarnings =

diff --git a/software/source/clients/base_device.py b/software/source/clients/base_device.py
@@ -2,6 +2,7 @@
 
 load_dotenv()  # take environment variables from .env.
 
+import subprocess
 import os
 import sys
 import asyncio
@@ -46,7 +47,7 @@
 CHUNK = 1024  # Record in chunks of 1024 samples
 FORMAT = pyaudio.paInt16  # 16 bits per sample
 CHANNELS = 1  # Mono
-RATE = 44100  # Sample rate
+RATE = 16000  # Sample rate
 RECORDING = False  # Flag to control recording state
 SPACEBAR_PRESSED = False  # Flag to track spacebar press state
 
@@ -60,12 +61,18 @@
 # Specify OS
 current_platform = get_system_info()
 
+
 def is_win11():
     return sys.getwindowsversion().build >= 22000
 
+
 def is_win10():
     try:
-        return platform.system() == "Windows" and "10" in platform.version() and not is_win11()
+        return (
+            platform.system() == "Windows"
+            and "10" in platform.version()
+            and not is_win11()
+        )
     except:
         return False
 
@@ -80,9 +87,10 @@ class Device:
     def __init__(self):
         self.pressed_keys = set()
         self.captured_images = []
-        self.audiosegments = []
+        self.audiosegments = asyncio.Queue()
         self.server_url = ""
         self.ctrl_pressed = False
+        self.tts_service = ""
 
     def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX):
         """Captures an image from the specified camera device and saves it to a temporary file. Adds the image to the captured_images list."""
@@ -144,11 +152,25 @@ def queue_all_captured_images(self):
 
     async def play_audiosegments(self):
         """Plays them sequentially."""
+
+        mpv_command = ["mpv", "--no-cache", "--no-terminal", "--", "fd://0"]
+        mpv_process = subprocess.Popen(
+            mpv_command,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+
         while True:
             try:
-                for audio in self.audiosegments:
+                audio = await self.audiosegments.get()
+
+                if self.tts_service == "elevenlabs":
+                    mpv_process.stdin.write(audio)  # type: ignore
+                    mpv_process.stdin.flush()  # type: ignore
+                else:
                     play(audio)
-                    self.audiosegments.remove(audio)
+
                 await asyncio.sleep(0.1)
             except asyncio.exceptions.CancelledError:
                 # This happens once at the start?
@@ -267,19 +289,18 @@ def toggle_recording(self, state):
     def on_press(self, key):
         """Detect spacebar press and Ctrl+C combination."""
         self.pressed_keys.add(key)  # Add the pressed key to the set
-
 
         if keyboard.Key.space in self.pressed_keys:
             self.toggle_recording(True)
-        elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char('c')} <= self.pressed_keys:
+        elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char("c")} <= self.pressed_keys:
             logger.info("Ctrl+C pressed. Exiting...")
             kill_process_tree()
             os._exit(0)
-        
+
         # Windows alternative to the above
         if key == keyboard.Key.ctrl_l:
             self.ctrl_pressed = True
-            
+
         try:
             if key.vk == 67 and self.ctrl_pressed:
                 logger.info("Ctrl+C pressed. Exiting...")
@@ -289,17 +310,17 @@ def on_press(self, key):
         except:
             pass
 
-
-
     def on_release(self, key):
         """Detect spacebar release and 'c' key press for camera, and handle key release."""
-        self.pressed_keys.discard(key)  # Remove the released key from the key press tracking set
+        self.pressed_keys.discard(
+            key
+        )  # Remove the released key from the key press tracking set
 
         if key == keyboard.Key.ctrl_l:
             self.ctrl_pressed = False
         if key == keyboard.Key.space:
             self.toggle_recording(False)
-        elif CAMERA_ENABLED and key == keyboard.KeyCode.from_char('c'):
+        elif CAMERA_ENABLED and key == keyboard.KeyCode.from_char("c"):
             self.fetch_image_from_camera()
 
     async def message_sender(self, websocket):
@@ -332,35 +353,48 @@ async def exec_ws_communication(websocket):
                 chunk = await websocket.recv()
 
                 logger.debug(f"Got this message from the server: {type(chunk)} {chunk}")
+                # print("received chunk from server")
 
                 if type(chunk) == str:
                     chunk = json.loads(chunk)
 
-                message = accumulator.accumulate(chunk)
+                    if chunk.get("type") == "config":
+                        self.tts_service = chunk.get("tts_service")
+                        continue
+
+                if self.tts_service == "elevenlabs":
+                    message = chunk
+                else:
+                    message = accumulator.accumulate(chunk)
+
                 if message == None:
                     # Will be None until we have a full message ready
                     continue
 
                 # At this point, we have our message
-
-                if message["type"] == "audio" and message["format"].startswith("bytes"):
+                if isinstance(message, bytes) or (
+                    message["type"] == "audio" and message["format"].startswith("bytes")
+                ):
                     # Convert bytes to audio file
-
-                    audio_bytes = message["content"]
-
-                    # Create an AudioSegment instance with the raw data
-                    audio = AudioSegment(
-                        # raw audio data (bytes)
-                        data=audio_bytes,
-                        # signed 16-bit little-endian format
-                        sample_width=2,
-                        # 16,000 Hz frame rate
-                        frame_rate=16000,
-                        # mono sound
-                        channels=1,
-                    )
-
-                    self.audiosegments.append(audio)
+                    if self.tts_service == "elevenlabs":
+                        audio_bytes = message
+                        audio = audio_bytes
+                    else:
+                        audio_bytes = message["content"]
+
+                        # Create an AudioSegment instance with the raw data
+                        audio = AudioSegment(
+                            # raw audio data (bytes)
+                            data=audio_bytes,
+                            # signed 16-bit little-endian format
+                            sample_width=2,
+                            # 22,050 Hz frame rate
+                            frame_rate=22050,
+                            # mono sound
+                            channels=1,
+                        )
+
+                    await self.audiosegments.put(audio)
 
                 # Run the code if that's the client's job
                 if os.getenv("CODE_RUNNER") == "client":
@@ -369,7 +403,7 @@ async def exec_ws_communication(websocket):
                         code = message["content"]
                         result = interpreter.computer.run(language, code)
                         send_queue.put(result)
-                        
+
         if is_win10():
             logger.info("Windows 10 detected")
             # Workaround for Windows 10 not latching to the websocket server.
@@ -399,6 +433,7 @@ async def start_async(self):
 
         # Start watching the kernel if it's your job to do that
         if os.getenv("CODE_RUNNER") == "client":
+            # client is not running code!
             asyncio.create_task(put_kernel_messages_into_queue(send_queue))
 
         asyncio.create_task(self.play_audiosegments())