From c22661d7b444f6e2e25d6f913796b6b6272dcca9 Mon Sep 17 00:00:00 2001
From: Alireza Kenarsari
Date: Tue, 28 May 2024 10:56:35 -0700
Subject: [PATCH] llm voice assistant in python

---
 README.md                                     |   7 +-
 recipes/.gitkeep                              |   0
 recipes/llm-voice-assistant/README.md         |  14 +
 recipes/llm-voice-assistant/python/README.md  |  74 ++++
 recipes/llm-voice-assistant/python/main.py    | 378 ++++++++++++++++++
 .../python/requirements.txt                   |   7 +
 res/.lint/spell-check/dict.txt                |  16 +-
 7 files changed, 494 insertions(+), 2 deletions(-)
 delete mode 100644 recipes/.gitkeep
 create mode 100644 recipes/llm-voice-assistant/README.md
 create mode 100644 recipes/llm-voice-assistant/python/README.md
 create mode 100644 recipes/llm-voice-assistant/python/main.py
 create mode 100644 recipes/llm-voice-assistant/python/requirements.txt

diff --git a/README.md b/README.md
index a8c2de5..149db16 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,6 @@
-# pico-cookbook
\ No newline at end of file
+# Pico Cookbook
+
+Made in Vancouver, Canada by [Picovoice](https://picovoice.ai)
+
+[![Twitter URL](https://img.shields.io/twitter/url?label=%40AiPicovoice&style=social&url=https%3A%2F%2Ftwitter.com%2FAiPicovoice)](https://twitter.com/AiPicovoice)
+[![YouTube Channel Views](https://img.shields.io/youtube/channel/views/UCAdi9sTCXLosG1XeqDwLx7w?label=YouTube&style=social)](https://www.youtube.com/channel/UCAdi9sTCXLosG1XeqDwLx7w)

diff --git a/recipes/.gitkeep b/recipes/.gitkeep
deleted file mode 100644
index e69de29..0000000

diff --git a/recipes/llm-voice-assistant/README.md b/recipes/llm-voice-assistant/README.md
new file mode 100644
index 0000000..958d7ac
--- /dev/null
+++ b/recipes/llm-voice-assistant/README.md
@@ -0,0 +1,14 @@
# LLM-Powered Voice Assistant

Hands-free voice assistant powered by a large language model (LLM); all voice recognition, LLM inference, and speech
synthesis run on-device.

## Components

- [Porcupine Wake Word](https://picovoice.ai/docs/porcupine/)
- [Cheetah Streaming Speech-to-Text](https://picovoice.ai/docs/cheetah/)
- [picoLLM Inference Engine](https://github.com/Picovoice/picollm)
- [Orca Streaming Text-to-Speech](https://picovoice.ai/docs/orca/)

## Implementations

- [Python](python)

diff --git a/recipes/llm-voice-assistant/python/README.md b/recipes/llm-voice-assistant/python/README.md
new file mode 100644
index 0000000..d8bb6de
--- /dev/null
+++ b/recipes/llm-voice-assistant/python/README.md
@@ -0,0 +1,74 @@
## Compatibility

- Python 3.8+
- Runs on Linux (x86_64), macOS (arm64, x86_64), Windows (x86_64), and Raspberry Pi (5 and 4).

## AccessKey

AccessKey is your authentication and authorization token for deploying Picovoice SDKs, including picoLLM. Anyone who
uses Picovoice needs a valid AccessKey, and you must keep it secret. Validating your AccessKey against the Picovoice
license servers requires internet connectivity, even though the LLM inference runs 100% offline and is completely free
for open-weight models. Everyone who signs up for [Picovoice Console](https://console.picovoice.ai/) receives a unique
AccessKey.

## picoLLM Model

picoLLM Inference Engine supports many open-weight models. The models are available on
[Picovoice Console](https://console.picovoice.ai/).
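
If you want to confirm that a downloaded `.pllm` file loads before wiring up the full demo, a minimal sketch using the
same `picollm.create` call as `main.py` is shown below (the `${ACCESS_KEY}` and `${PICOLLM_MODEL_PATH}` placeholders are
yours to fill in):

```python
import picollm

# Sanity check (sketch): load the downloaded `.pllm` model and print its name.
pllm = picollm.create(
    access_key='${ACCESS_KEY}',             # AccessKey from Picovoice Console
    model_path='${PICOLLM_MODEL_PATH}')     # path to the downloaded `.pllm` file
print(pllm.model)                           # the loaded model's name
pllm.release()                              # free model resources when done
```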

## Usage

Install the required packages:

```console
pip install -r requirements.txt
```

Run the demo:

```console
python3 main.py --access_key ${ACCESS_KEY} --picollm_model_path ${PICOLLM_MODEL_PATH}
```

Replace `${ACCESS_KEY}` with your AccessKey obtained from Picovoice Console and `${PICOLLM_MODEL_PATH}` with the path
to the model downloaded from Picovoice Console.

To see all available options, type the following:

```console
python main.py --help
```

## Custom Wake Word

The demo's default wake phrase is `Picovoice`. You can generate a custom (branded) wake word using Picovoice Console by
following the [Porcupine Wake Word documentation](https://picovoice.ai/docs/porcupine/). Once you have trained the
model, simply pass it to the demo application using the `--keyword_model_path` argument.

## Profiling

To see the runtime profiling metrics, run the demo with the `--profile` argument:

```console
python3 main.py --access_key ${ACCESS_KEY} --picollm_model_path ${PICOLLM_MODEL_PATH} --profile
```

Replace `${ACCESS_KEY}` with your AccessKey obtained from Picovoice Console and `${PICOLLM_MODEL_PATH}` with the path
to the model downloaded from Picovoice Console.

The demo profiles three metrics: Real-time Factor (RTF), Tokens per Second (TPS), and Latency.

### Real-time Factor (RTF)

RTF is a standard metric for measuring the speed of speech processing (e.g., wake word, speech-to-text, and
text-to-speech). RTF is the CPU time divided by the length of the processed (recognized or synthesized) audio. Hence, a
lower RTF means a more efficient engine.

### Tokens per Second (TPS)

Tokens per second is the standard metric for measuring the speed of LLM inference engines. TPS is the number of
generated tokens divided by the compute time used to create them. A higher TPS is better.

### Latency

We measure latency as the delay between the end of the user's utterance (i.e., the moment the user finishes talking)
and the time the voice assistant produces the first chunk of the audio response (i.e., the moment the user starts
hearing the response).

diff --git a/recipes/llm-voice-assistant/python/main.py b/recipes/llm-voice-assistant/python/main.py
new file mode 100644
index 0000000..a10fdeb
--- /dev/null
+++ b/recipes/llm-voice-assistant/python/main.py
@@ -0,0 +1,378 @@
import signal
import time
from argparse import ArgumentParser
from multiprocessing import (
    Pipe,
    Process,
)
from typing import (
    Optional,
    Sequence,
)

import picollm
import pvcheetah
import pvorca
import pvporcupine
from pvrecorder import PvRecorder


class RTFProfiler:
    def __init__(self, sample_rate: int) -> None:
        self._sample_rate = sample_rate
        self._compute_sec = 0.
        self._audio_sec = 0.
        self._tick_sec = 0.

    def tick(self) -> None:
        self._tick_sec = time.perf_counter()

    def tock(self, audio: Optional[Sequence[int]] = None) -> None:
        self._compute_sec += time.perf_counter() - self._tick_sec
        self._audio_sec += (len(audio) / self._sample_rate) if audio is not None else 0.

    def rtf(self) -> float:
        rtf = self._compute_sec / self._audio_sec
        self._compute_sec = 0.
        self._audio_sec = 0.
        return rtf


class TPSProfiler(object):
    def __init__(self) -> None:
        self._num_tokens = 0
        self._start_sec = 0.
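        # tock() is called from the picoLLM stream callback for every generated text chunk;
        # the first call only records the start time, so tps() measures steady-state
        # generation speed and excludes prompt processing / time-to-first-token.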

    def tock(self) -> None:
        if self._start_sec == 0.:
            self._start_sec = time.perf_counter()
        else:
            self._num_tokens += 1

    def tps(self) -> float:
        tps = self._num_tokens / (time.perf_counter() - self._start_sec)
        self._num_tokens = 0
        self._start_sec = 0.
        return tps


def orca_worker(access_key: str, connection, warmup_sec: float, stream_frame_sec: float = 0.03) -> None:
    # noinspection PyUnresolvedReferences
    import numpy as np
    from sounddevice import OutputStream

    orca = pvorca.create(access_key=access_key)
    orca_stream = orca.stream_open()

    texts = list()
    pcm_buffer = list()
    warmup = [False]
    synthesize = False
    flush = False
    close = False
    utterance_end_sec = 0.
    delay_sec = [-1.]

    # sounddevice output callback: copy buffered PCM into the output frame, zero-filling while warming up.
    def callback(data, _, __, ___) -> None:
        if warmup[0]:
            if len(pcm_buffer) < int(warmup_sec * orca.sample_rate):
                data[:, 0] = 0
                return
            else:
                warmup[0] = False

        if len(pcm_buffer) < data.shape[0]:
            pcm_buffer.extend([0] * (data.shape[0] - len(pcm_buffer)))

        data[:, 0] = pcm_buffer[:data.shape[0]]
        del pcm_buffer[:data.shape[0]]

    stream = OutputStream(
        samplerate=orca.sample_rate,
        blocksize=int(stream_frame_sec * orca.sample_rate),
        channels=1,
        dtype='int16',
        callback=callback)

    connection.send({'version': orca.version})

    orca_profiler = RTFProfiler(orca.sample_rate)

    def buffer_pcm(x: Optional[Sequence[int]]) -> None:
        if x is not None:
            pcm_buffer.extend(x)
            if delay_sec[0] == -1:
                delay_sec[0] = time.perf_counter() - utterance_end_sec

    # Worker loop: synthesize queued text chunks and honor synthesize/flush/close commands from the main process.
    while True:
        if synthesize and len(texts) > 0:
            orca_profiler.tick()
            pcm = orca_stream.synthesize(texts.pop(0))
            orca_profiler.tock(pcm)
            buffer_pcm(pcm)
        elif flush:
            while len(texts) > 0:
                orca_profiler.tick()
                pcm = orca_stream.synthesize(texts.pop(0))
                orca_profiler.tock(pcm)
                buffer_pcm(pcm)
            orca_profiler.tick()
            pcm = orca_stream.flush()
            orca_profiler.tock(pcm)
            buffer_pcm(pcm)
            connection.send({'rtf': orca_profiler.rtf(), 'delay': delay_sec[0]})
            flush = False
            while len(pcm_buffer) > 0:
                time.sleep(stream_frame_sec)
            stream.stop()
            delay_sec[0] = -1
            connection.send({'done': True})
        elif close:
            break
        else:
            time.sleep(stream_frame_sec)

        while connection.poll():
            message = connection.recv()
            if message['command'] == 'synthesize':
                texts.append(message['text'])
                if not stream.active:
                    stream.start()
                    warmup[0] = True
                utterance_end_sec = message['utterance_end_sec']
                synthesize = True
            elif message['command'] == 'flush':
                synthesize = False
                flush = True
            elif message['command'] == 'close':
                close = True

    stream.close()
    orca_stream.close()
    orca.delete()
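

# main() wires the pipeline together: PvRecorder feeds microphone audio to Porcupine (wake
# word) and Cheetah (streaming speech-to-text), picoLLM generates the response, and each
# generated text chunk is handed to the orca_worker process above so synthesis and playback
# never block inference.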
def main() -> None:
    parser = ArgumentParser()
    parser.add_argument(
        '--access_key',
        required=True,
        help='`AccessKey` obtained from `Picovoice Console` (https://console.picovoice.ai/).')
    parser.add_argument(
        '--picollm_model_path',
        required=True,
        help='Absolute path to the file containing LLM parameters (`.pllm`).')
    parser.add_argument(
        '--keyword_model_path',
        help='Absolute path to the keyword model file (`.ppn`). If not set, `Picovoice` will be the wake phrase.')
    parser.add_argument(
        '--cheetah_endpoint_duration_sec',
        type=float,
        default=1.,
        help="Duration of silence (pause) after the user's utterance to consider it the end of the utterance.")
    parser.add_argument(
        '--picollm_device',
        help="String representation of the device (e.g., CPU or GPU) to use for inference. If set to `best`, picoLLM "
             "picks the most suitable device. If set to `gpu`, the engine uses the first available GPU device. To "
             "select a specific GPU device, set this argument to `gpu:${GPU_INDEX}`, where `${GPU_INDEX}` is the index "
             "of the target GPU. If set to `cpu`, the engine will run on the CPU with the default number of threads. "
             "To specify the number of threads, set this argument to `cpu:${NUM_THREADS}`, where `${NUM_THREADS}` is "
             "the desired number of threads.")
    parser.add_argument(
        '--picollm_completion_token_limit',
        type=int,
        default=256,
        help="Maximum number of tokens in the completion. Set to `None` to impose no limit.")
    parser.add_argument(
        '--picollm_presence_penalty',
        type=float,
        default=0.,
        help="If set to a positive value, it penalizes logits that already appear in the partial completion. If set "
             "to `0.0`, it has no effect.")
    parser.add_argument(
        '--picollm_frequency_penalty',
        type=float,
        default=0.,
        help="If set to a positive floating-point value, it penalizes logits proportional to the frequency of their "
             "appearance in the partial completion. If set to `0.0`, it has no effect.")
    parser.add_argument(
        '--picollm_temperature',
        type=float,
        default=0.,
        help="Sampling temperature. Temperature is a non-negative floating-point value that controls the randomness "
             "of the sampler. A higher temperature smooths the sampler's output, increasing the randomness. In "
             "contrast, a lower temperature creates a narrower distribution and reduces variability. Setting it to "
             "`0` selects the maximum logit during sampling.")
    parser.add_argument(
        '--picollm_top_p',
        type=float,
        default=1.,
        help="A positive floating-point number within (0, 1]. It restricts the sampler's choices to high-probability "
             "logits that form the `top_p` portion of the probability mass. Hence, it avoids randomly selecting "
             "unlikely logits. A value of `1.` enables the sampler to pick any token with non-zero probability, "
             "turning off the feature.")
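    # The four picoLLM sampling options above (presence penalty, frequency penalty,
    # temperature, and top_p) are passed straight through to `pllm.generate()` below.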
    parser.add_argument(
        '--orca_warmup_sec',
        type=float,
        default=0.,
        help="Duration of the synthesized audio to buffer before streaming it out. A higher value helps slower "
             "devices (e.g., Raspberry Pi) keep up with real-time at the cost of increasing the initial delay.")
    parser.add_argument('--profile', action='store_true', help='Show runtime profiling information.')
    parser.add_argument('--short_answers', action='store_true', help='Instruct the LLM to keep its answers short but informative.')
    args = parser.parse_args()

    access_key = args.access_key
    picollm_model_path = args.picollm_model_path
    keyword_model_path = args.keyword_model_path
    cheetah_endpoint_duration_sec = args.cheetah_endpoint_duration_sec
    picollm_device = args.picollm_device
    picollm_completion_token_limit = args.picollm_completion_token_limit
    picollm_presence_penalty = args.picollm_presence_penalty
    picollm_frequency_penalty = args.picollm_frequency_penalty
    picollm_temperature = args.picollm_temperature
    picollm_top_p = args.picollm_top_p
    orca_warmup_sec = args.orca_warmup_sec
    profile = args.profile
    short_answers = args.short_answers

    if keyword_model_path is None:
        porcupine = pvporcupine.create(access_key=access_key, keywords=['picovoice'])
    else:
        porcupine = pvporcupine.create(access_key=access_key, keyword_paths=[keyword_model_path])
    print(f"→ Porcupine V{porcupine.version}")

    cheetah = pvcheetah.create(
        access_key=access_key,
        endpoint_duration_sec=cheetah_endpoint_duration_sec,
        enable_automatic_punctuation=True)
    print(f"→ Cheetah V{cheetah.version}")

    pllm = picollm.create(access_key=access_key, model_path=picollm_model_path, device=picollm_device)
    dialog = pllm.get_dialog()
    print(f"→ picoLLM V{pllm.version} <{pllm.model}>")

    main_connection, orca_process_connection = Pipe()
    orca_process = Process(target=orca_worker, args=(access_key, orca_process_connection, orca_warmup_sec))
    orca_process.start()
    while not main_connection.poll():
        time.sleep(0.01)
    print(f"→ Orca V{main_connection.recv()['version']}")

    mic = PvRecorder(frame_length=porcupine.frame_length)
    mic.start()

    print(f"\n$ Say {'`Picovoice`' if keyword_model_path is None else 'the wake word'} ...")

    stop = [False]

    def handler(_, __) -> None:
        stop[0] = True

    signal.signal(signal.SIGINT, handler)

    wake_word_detected = False
    user_request = ''
    endpoint_reached = False
    utterance_end_sec = 0

    porcupine_profiler = RTFProfiler(porcupine.sample_rate)
    cheetah_profiler = RTFProfiler(cheetah.sample_rate)
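
    # Interaction loop: wait for the wake word (Porcupine), stream the user's request through
    # Cheetah until an endpoint (pause) is detected, then generate the reply with picoLLM while
    # streaming each text chunk to the Orca worker process for synthesis and playback.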
{user_request}" if short_answers else user_request) + + picollm_profiler = TPSProfiler() + + stop_phrases = { + '', # Llama-2, Mistral, and Mixtral + '', # Gemma + '<|endoftext|>', # Phi-2 + '<|eot_id|>', # Llama-3 + } + + completion = [''] + + def llm_callback(text: str) -> None: + picollm_profiler.tock() + completion[0] += text + if not any(x in completion[0] for x in stop_phrases): + main_connection.send({ + 'command': 'synthesize', + 'text': text.replace('\n', ' . '), + 'utterance_end_sec': utterance_end_sec}) + print(text, end='', flush=True) + + print("\nLLM > ", end='', flush=True) + res = pllm.generate( + prompt=dialog.prompt(), + completion_token_limit=picollm_completion_token_limit, + stop_phrases=stop_phrases, + presence_penalty=picollm_presence_penalty, + frequency_penalty=picollm_frequency_penalty, + temperature=picollm_temperature, + top_p=picollm_top_p, + stream_callback=llm_callback) + main_connection.send({'command': 'flush'}) + print('\n') + dialog.add_llm_response(res.completion) + if profile: + print(f"[picoLLM TPS: {picollm_profiler.tps():.2f}]") + + while not main_connection.poll(): + time.sleep(0.01) + message = main_connection.recv() + if profile: + print(f"[Orca RTF: {message['rtf']:.2f}]") + print(f"[Delay: {message['delay']:.2f} sec]") + while not main_connection.poll(): + time.sleep(0.01) + assert main_connection.recv()['done'] + + wake_word_detected = False + user_request = '' + endpoint_reached = False + print(f"\n$ Say {'`Picovoice`' if keyword_model_path is None else 'the wake word'} ...") + finally: + main_connection.send({'command': 'close'}) + mic.delete() + pllm.release() + cheetah.delete() + porcupine.delete() + orca_process.join() + + +if __name__ == '__main__': + main() diff --git a/recipes/llm-voice-assistant/python/requirements.txt b/recipes/llm-voice-assistant/python/requirements.txt new file mode 100644 index 0000000..f911465 --- /dev/null +++ b/recipes/llm-voice-assistant/python/requirements.txt @@ -0,0 +1,7 @@ +numpy +picollm==1.0.0 +pvcheetah==2.0.1 +pvorca==0.2.2 +pvporcupine==3.0.2 +pvrecorder==1.2.2 +sounddevice diff --git a/res/.lint/spell-check/dict.txt b/res/.lint/spell-check/dict.txt index f400b3f..1893aea 100644 --- a/res/.lint/spell-check/dict.txt +++ b/res/.lint/spell-check/dict.txt @@ -1,2 +1,16 @@ +dtype +endoftext +logit +mixtral +numpy pico -picovoice \ No newline at end of file +picollm +picovoice +pllm +pvcheetah +pvorca +pvporcupine +pvrecorder +samplerate +sounddevice +tock \ No newline at end of file