From 384b3ba20805c7c10dff2ef1bfaedf0dfcaeaa23 Mon Sep 17 00:00:00 2001 From: Albert Ho <82767499+albho@users.noreply.github.com> Date: Tue, 13 Aug 2024 13:22:25 -0700 Subject: [PATCH] pvspeaker for Python llm-voice-assistant --- recipes/llm-voice-assistant/python/main.py | 67 +++++++++---------- .../python/requirements.txt | 3 +- res/.lint/spell-check/dict.txt | 3 +- 3 files changed, 36 insertions(+), 37 deletions(-) diff --git a/recipes/llm-voice-assistant/python/main.py b/recipes/llm-voice-assistant/python/main.py index a10fdeb..6e75722 100644 --- a/recipes/llm-voice-assistant/python/main.py +++ b/recipes/llm-voice-assistant/python/main.py @@ -1,6 +1,8 @@ import signal import time from argparse import ArgumentParser +from collections import deque +from itertools import chain from multiprocessing import ( Pipe, Process, @@ -15,6 +17,7 @@ import pvorca import pvporcupine from pvrecorder import PvRecorder +from pvspeaker import PvSpeaker class RTFProfiler: @@ -57,15 +60,11 @@ def tps(self) -> float: def orca_worker(access_key: str, connection, warmup_sec: float, stream_frame_sec: int = 0.03) -> None: - # noinspection PyUnresolvedReferences - import numpy as np - from sounddevice import OutputStream - orca = pvorca.create(access_key=access_key) orca_stream = orca.stream_open() texts = list() - pcm_buffer = list() + pcm_deque = deque() warmup = [False] synthesize = False flush = False @@ -73,58 +72,58 @@ def orca_worker(access_key: str, connection, warmup_sec: float, stream_frame_sec utterance_end_sec = 0. delay_sec = [-1.] - def callback(data, _, __, ___) -> None: - if warmup[0]: - if len(pcm_buffer) < int(warmup_sec * orca.sample_rate): - data[:, 0] = 0 - return - else: - warmup[0] = False - - if len(pcm_buffer) < data.shape[0]: - pcm_buffer.extend([0] * (data.shape[0] - len(pcm_buffer))) - - data[:, 0] = pcm_buffer[:data.shape[0]] - del pcm_buffer[:data.shape[0]] - - stream = OutputStream( - samplerate=orca.sample_rate, - blocksize=int(stream_frame_sec * orca.sample_rate), - channels=1, - dtype='int16', - callback=callback) + speaker = PvSpeaker(sample_rate=orca.sample_rate, bits_per_sample=16, buffer_size_secs=20) connection.send({'version': orca.version}) orca_profiler = RTFProfiler(orca.sample_rate) - def buffer_pcm(x: Optional[Sequence[int]]) -> None: - if x is not None: - pcm_buffer.extend(x) + def buffer_pcm(pcm_chunk: Optional[Sequence[int]]) -> None: + if pcm_chunk is not None: if delay_sec[0] == -1: delay_sec[0] = time.perf_counter() - utterance_end_sec + pcm_deque.append(pcm_chunk) + + def play_buffered_pcm() -> None: + if warmup[0]: + if len(list(chain.from_iterable(pcm_deque))) < int(warmup_sec * orca.sample_rate): + return + else: + warmup[0] = False + + if len(pcm_deque) > 0: + pcm_chunk = list(chain.from_iterable(pcm_deque)) + pcm_deque.clear() + + written = speaker.write(pcm_chunk) + if written < len(pcm_chunk): + pcm_deque.appendleft(pcm_chunk[written:]) + while True: if synthesize and len(texts) > 0: orca_profiler.tick() pcm = orca_stream.synthesize(texts.pop(0)) orca_profiler.tock(pcm) buffer_pcm(pcm) + play_buffered_pcm() elif flush: while len(texts) > 0: orca_profiler.tick() pcm = orca_stream.synthesize(texts.pop(0)) orca_profiler.tock(pcm) buffer_pcm(pcm) + play_buffered_pcm() orca_profiler.tick() pcm = orca_stream.flush() orca_profiler.tock(pcm) buffer_pcm(pcm) + play_buffered_pcm() connection.send({'rtf': orca_profiler.rtf(), 'delay': delay_sec[0]}) flush = False - while len(pcm_buffer) > 0: - time.sleep(stream_frame_sec) - stream.stop() + speaker.flush(list(chain.from_iterable(pcm_deque))) + pcm_deque.clear() + speaker.stop() delay_sec[0] = -1 connection.send({'done': True}) elif close: @@ -136,8 +135,8 @@ def buffer_pcm(x: Optional[Sequence[int]]) -> None: message = connection.recv() if message['command'] == 'synthesize': texts.append(message['text']) - if not stream.active: - stream.start() + if not speaker.is_started: + speaker.start() warmup[0] = True utterance_end_sec = message['utterance_end_sec'] synthesize = True @@ -147,7 +146,7 @@ def buffer_pcm(x: Optional[Sequence[int]]) -> None: elif message['command'] == 'close': close = True - stream.close() + speaker.delete() orca_stream.close() orca.delete() diff --git a/recipes/llm-voice-assistant/python/requirements.txt b/recipes/llm-voice-assistant/python/requirements.txt index cf6ff89..9e0b63f 100644 --- a/recipes/llm-voice-assistant/python/requirements.txt +++ b/recipes/llm-voice-assistant/python/requirements.txt @@ -1,7 +1,6 @@ -numpy picollm==1.0.0 pvcheetah==2.0.1 pvorca==0.2.3 pvporcupine==3.0.2 pvrecorder==1.2.2 -sounddevice +pvspeaker==1.0.2 diff --git a/res/.lint/spell-check/dict.txt b/res/.lint/spell-check/dict.txt index c55e387..69de399 100644 --- a/res/.lint/spell-check/dict.txt +++ b/res/.lint/spell-check/dict.txt @@ -1,3 +1,4 @@ +appendleft Colour Compat dotdotdot @@ -17,9 +18,9 @@ pvcheetah pvorca pvporcupine pvrecorder +pvspeaker samplerate sdcard -sounddevice Spannable tock tock