From c22661d7b444f6e2e25d6f913796b6b6272dcca9 Mon Sep 17 00:00:00 2001
From: Alireza Kenarsari
Date: Tue, 28 May 2024 10:56:35 -0700
Subject: [PATCH] llm voice assistant in python

---
 README.md                                     |   7 +-
 recipes/.gitkeep                              |   0
 recipes/llm-voice-assistant/README.md         |  14 +
 recipes/llm-voice-assistant/python/README.md  |  74 ++++
 recipes/llm-voice-assistant/python/main.py    | 378 ++++++++++++++++++
 .../python/requirements.txt                   |   7 +
 res/.lint/spell-check/dict.txt                |  16 +-
 7 files changed, 494 insertions(+), 2 deletions(-)
 delete mode 100644 recipes/.gitkeep
 create mode 100644 recipes/llm-voice-assistant/README.md
 create mode 100644 recipes/llm-voice-assistant/python/README.md
 create mode 100644 recipes/llm-voice-assistant/python/main.py
 create mode 100644 recipes/llm-voice-assistant/python/requirements.txt

diff --git a/README.md b/README.md
index a8c2de5..149db16 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,6 @@
-# pico-cookbook
\ No newline at end of file
+# Pico Cookbook
+
+Made in Vancouver, Canada by [Picovoice](https://picovoice.ai)
+
+[![Twitter URL](https://img.shields.io/twitter/url?label=%40AiPicovoice&style=social&url=https%3A%2F%2Ftwitter.com%2FAiPicovoice)](https://twitter.com/AiPicovoice)
+[![YouTube Channel Views](https://img.shields.io/youtube/channel/views/UCAdi9sTCXLosG1XeqDwLx7w?label=YouTube&style=social)](https://www.youtube.com/channel/UCAdi9sTCXLosG1XeqDwLx7w)

diff --git a/recipes/.gitkeep b/recipes/.gitkeep
deleted file mode 100644
index e69de29..0000000

diff --git a/recipes/llm-voice-assistant/README.md b/recipes/llm-voice-assistant/README.md
new file mode 100644
index 0000000..958d7ac
--- /dev/null
+++ b/recipes/llm-voice-assistant/README.md
@@ -0,0 +1,14 @@
# LLM-Powered Voice Assistant

Hands-free voice assistant powered by a large language model (LLM); all voice recognition, LLM inference, and speech
synthesis run on-device.

## Components

- [Porcupine Wake Word](https://picovoice.ai/docs/porcupine/)
- [Cheetah Streaming Speech-to-Text](https://picovoice.ai/docs/cheetah/)
- [picoLLM Inference Engine](https://github.com/Picovoice/picollm)
- [Orca Streaming Text-to-Speech](https://picovoice.ai/docs/orca/)

## Implementations

- [Python](python)

diff --git a/recipes/llm-voice-assistant/python/README.md b/recipes/llm-voice-assistant/python/README.md
new file mode 100644
index 0000000..d8bb6de
--- /dev/null
+++ b/recipes/llm-voice-assistant/python/README.md
@@ -0,0 +1,74 @@
## Compatibility

- Python 3.8+
- Runs on Linux (x86_64), macOS (arm64, x86_64), Windows (x86_64), and Raspberry Pi (5 and 4).

## AccessKey

AccessKey is your authentication and authorization token for deploying Picovoice SDKs, including picoLLM. Anyone who
uses Picovoice needs a valid AccessKey, and you must keep it secret. Validating your AccessKey against the Picovoice
license servers requires internet connectivity, even though the LLM inference runs 100% offline and is completely free
for open-weight models. Everyone who signs up for [Picovoice Console](https://console.picovoice.ai/) receives a unique
AccessKey.

## picoLLM Model

picoLLM Inference Engine supports many open-weight models. The models are available on
[Picovoice Console](https://console.picovoice.ai/).
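
If you want to confirm that a downloaded `.pllm` file loads before wiring up the full demo, a minimal sketch using the
same `picollm.create` call as `main.py` is shown below (the `${ACCESS_KEY}` and `${PICOLLM_MODEL_PATH}` placeholders are
yours to fill in):

```python
import picollm

# Sanity check (sketch): load the downloaded `.pllm` model and print its name.
pllm = picollm.create(
    access_key='${ACCESS_KEY}',             # AccessKey from Picovoice Console
    model_path='${PICOLLM_MODEL_PATH}')     # path to the downloaded `.pllm` file
print(pllm.model)                           # the loaded model's name
pllm.release()                              # free model resources when done
```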

## Usage

Install the required packages:

```console
pip install -r requirements.txt
```

Run the demo:

```console
python3 main.py --access_key ${ACCESS_KEY} --picollm_model_path ${PICOLLM_MODEL_PATH}
```

Replace `${ACCESS_KEY}` with your AccessKey obtained from Picovoice Console and `${PICOLLM_MODEL_PATH}` with the path
to the model downloaded from Picovoice Console.

To see all available options, type the following:

```console
python main.py --help
```

## Custom Wake Word

The demo's default wake phrase is `Picovoice`. You can generate a custom (branded) wake word using Picovoice Console by
following the [Porcupine Wake Word documentation](https://picovoice.ai/docs/porcupine/). Once you have trained the
model, simply pass it to the demo application using the `--keyword_model_path` argument.

## Profiling

To see the runtime profiling metrics, run the demo with the `--profile` argument:

```console
python3 main.py --access_key ${ACCESS_KEY} --picollm_model_path ${PICOLLM_MODEL_PATH} --profile
```

Replace `${ACCESS_KEY}` with your AccessKey obtained from Picovoice Console and `${PICOLLM_MODEL_PATH}` with the path
to the model downloaded from Picovoice Console.

The demo profiles three metrics: Real-time Factor (RTF), Tokens per Second (TPS), and Latency.

### Real-time Factor (RTF)

RTF is a standard metric for measuring the speed of speech processing (e.g., wake word, speech-to-text, and
text-to-speech). RTF is the CPU time divided by the length of the processed (recognized or synthesized) audio. Hence, a
lower RTF means a more efficient engine.

### Tokens per Second (TPS)

Tokens per second is the standard metric for measuring the speed of LLM inference engines. TPS is the number of
generated tokens divided by the compute time used to create them. A higher TPS is better.

### Latency

We measure latency as the delay between the end of the user's utterance (i.e., the moment the user finishes talking)
and the time the voice assistant produces the first chunk of the audio response (i.e., the moment the user starts
hearing the response).

diff --git a/recipes/llm-voice-assistant/python/main.py b/recipes/llm-voice-assistant/python/main.py
new file mode 100644
index 0000000..a10fdeb
--- /dev/null
+++ b/recipes/llm-voice-assistant/python/main.py
@@ -0,0 +1,378 @@
import signal
import time
from argparse import ArgumentParser
from multiprocessing import (
    Pipe,
    Process,
)
from typing import (
    Optional,
    Sequence,
)

import picollm
import pvcheetah
import pvorca
import pvporcupine
from pvrecorder import PvRecorder


class RTFProfiler:
    def __init__(self, sample_rate: int) -> None:
        self._sample_rate = sample_rate
        self._compute_sec = 0.
        self._audio_sec = 0.
        self._tick_sec = 0.

    def tick(self) -> None:
        self._tick_sec = time.perf_counter()

    def tock(self, audio: Optional[Sequence[int]] = None) -> None:
        self._compute_sec += time.perf_counter() - self._tick_sec
        self._audio_sec += (len(audio) / self._sample_rate) if audio is not None else 0.

    def rtf(self) -> float:
        rtf = self._compute_sec / self._audio_sec
        self._compute_sec = 0.
        self._audio_sec = 0.
        return rtf


class TPSProfiler(object):
    def __init__(self) -> None:
        self._num_tokens = 0
        self._start_sec = 0.
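        # tock() is called from the picoLLM stream callback for every generated text chunk;
        # the first call only records the start time, so tps() measures steady-state
        # generation speed and excludes prompt processing / time-to-first-token.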

    def tock(self) -> None:
        if self._start_sec == 0.:
            self._start_sec = time.perf_counter()
        else:
            self._num_tokens += 1

    def tps(self) -> float:
        tps = self._num_tokens / (time.perf_counter() - self._start_sec)
        self._num_tokens = 0
        self._start_sec = 0.
        return tps


def orca_worker(access_key: str, connection, warmup_sec: float, stream_frame_sec: float = 0.03) -> None:
    # noinspection PyUnresolvedReferences
    import numpy as np
    from sounddevice import OutputStream

    orca = pvorca.create(access_key=access_key)
    orca_stream = orca.stream_open()

    texts = list()
    pcm_buffer = list()
    warmup = [False]
    synthesize = False
    flush = False
    close = False
    utterance_end_sec = 0.
    delay_sec = [-1.]

    # sounddevice output callback: copy buffered PCM into the output frame, zero-filling while warming up.
    def callback(data, _, __, ___) -> None:
        if warmup[0]:
            if len(pcm_buffer) < int(warmup_sec * orca.sample_rate):
                data[:, 0] = 0
                return
            else:
                warmup[0] = False

        if len(pcm_buffer) < data.shape[0]:
            pcm_buffer.extend([0] * (data.shape[0] - len(pcm_buffer)))

        data[:, 0] = pcm_buffer[:data.shape[0]]
        del pcm_buffer[:data.shape[0]]

    stream = OutputStream(
        samplerate=orca.sample_rate,
        blocksize=int(stream_frame_sec * orca.sample_rate),
        channels=1,
        dtype='int16',
        callback=callback)

    connection.send({'version': orca.version})

    orca_profiler = RTFProfiler(orca.sample_rate)

    def buffer_pcm(x: Optional[Sequence[int]]) -> None:
        if x is not None:
            pcm_buffer.extend(x)
            if delay_sec[0] == -1:
                delay_sec[0] = time.perf_counter() - utterance_end_sec

    # Worker loop: synthesize queued text chunks and honor synthesize/flush/close commands from the main process.
    while True:
        if synthesize and len(texts) > 0:
            orca_profiler.tick()
            pcm = orca_stream.synthesize(texts.pop(0))
            orca_profiler.tock(pcm)
            buffer_pcm(pcm)
        elif flush:
            while len(texts) > 0:
                orca_profiler.tick()
                pcm = orca_stream.synthesize(texts.pop(0))
                orca_profiler.tock(pcm)
                buffer_pcm(pcm)
            orca_profiler.tick()
            pcm = orca_stream.flush()
            orca_profiler.tock(pcm)
            buffer_pcm(pcm)
            connection.send({'rtf': orca_profiler.rtf(), 'delay': delay_sec[0]})
            flush = False
            while len(pcm_buffer) > 0:
                time.sleep(stream_frame_sec)
            stream.stop()
            delay_sec[0] = -1
            connection.send({'done': True})
        elif close:
            break
        else:
            time.sleep(stream_frame_sec)

        while connection.poll():
            message = connection.recv()
            if message['command'] == 'synthesize':
                texts.append(message['text'])
                if not stream.active:
                    stream.start()
                    warmup[0] = True
                utterance_end_sec = message['utterance_end_sec']
                synthesize = True
            elif message['command'] == 'flush':
                synthesize = False
                flush = True
            elif message['command'] == 'close':
                close = True

    stream.close()
    orca_stream.close()
    orca.delete()
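

# main() wires the pipeline together: PvRecorder feeds microphone audio to Porcupine (wake
# word) and Cheetah (streaming speech-to-text), picoLLM generates the response, and each
# generated text chunk is handed to the orca_worker process above so synthesis and playback
# never block inference.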
def main() -> None:
    parser = ArgumentParser()
    parser.add_argument(
        '--access_key',
        required=True,
        help='`AccessKey` obtained from `Picovoice Console` (https://console.picovoice.ai/).')
    parser.add_argument(
        '--picollm_model_path',
        required=True,
        help='Absolute path to the file containing LLM parameters (`.pllm`).')
    parser.add_argument(
        '--keyword_model_path',
        help='Absolute path to the keyword model file (`.ppn`). If not set, `Picovoice` will be the wake phrase.')
    parser.add_argument(
        '--cheetah_endpoint_duration_sec',
        type=float,
        default=1.,
        help="Duration of silence (pause) after the user's utterance to consider it the end of the utterance.")
    parser.add_argument(
        '--picollm_device',
        help="String representation of the device (e.g., CPU or GPU) to use for inference. If set to `best`, picoLLM "
             "picks the most suitable device. If set to `gpu`, the engine uses the first available GPU device. To "
             "select a specific GPU device, set this argument to `gpu:${GPU_INDEX}`, where `${GPU_INDEX}` is the index "
             "of the target GPU. If set to `cpu`, the engine will run on the CPU with the default number of threads. "
             "To specify the number of threads, set this argument to `cpu:${NUM_THREADS}`, where `${NUM_THREADS}` is "
             "the desired number of threads.")
    parser.add_argument(
        '--picollm_completion_token_limit',
        type=int,
        default=256,
        help="Maximum number of tokens in the completion. Set to `None` to impose no limit.")
    parser.add_argument(
        '--picollm_presence_penalty',
        type=float,
        default=0.,
        help="If set to a positive value, it penalizes logits that already appear in the partial completion. If set "
             "to `0.0`, it has no effect.")
    parser.add_argument(
        '--picollm_frequency_penalty',
        type=float,
        default=0.,
        help="If set to a positive floating-point value, it penalizes logits proportional to the frequency of their "
             "appearance in the partial completion. If set to `0.0`, it has no effect.")
    parser.add_argument(
        '--picollm_temperature',
        type=float,
        default=0.,
        help="Sampling temperature. Temperature is a non-negative floating-point value that controls the randomness "
             "of the sampler. A higher temperature smooths the sampler's output, increasing the randomness. In "
             "contrast, a lower temperature creates a narrower distribution and reduces variability. Setting it to "
             "`0` selects the maximum logit during sampling.")
    parser.add_argument(
        '--picollm_top_p',
        type=float,
        default=1.,
        help="A positive floating-point number within (0, 1]. It restricts the sampler's choices to high-probability "
             "logits that form the `top_p` portion of the probability mass. Hence, it avoids randomly selecting "
             "unlikely logits. A value of `1.` enables the sampler to pick any token with non-zero probability, "
             "turning off the feature.")
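    # The four picoLLM sampling options above (presence penalty, frequency penalty,
    # temperature, and top_p) are passed straight through to `pllm.generate()` below.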
    parser.add_argument(
        '--orca_warmup_sec',
        type=float,
        default=0.,
        help="Duration of the synthesized audio to buffer before streaming it out. A higher value helps slower "
             "devices (e.g., Raspberry Pi) keep up with real-time at the cost of increasing the initial delay.")
    parser.add_argument('--profile', action='store_true', help='Show runtime profiling information.')
    parser.add_argument('--short_answers', action='store_true', help='Instruct the LLM to keep its answers short but informative.')
    args = parser.parse_args()

    access_key = args.access_key
    picollm_model_path = args.picollm_model_path
    keyword_model_path = args.keyword_model_path
    cheetah_endpoint_duration_sec = args.cheetah_endpoint_duration_sec
    picollm_device = args.picollm_device
    picollm_completion_token_limit = args.picollm_completion_token_limit
    picollm_presence_penalty = args.picollm_presence_penalty
    picollm_frequency_penalty = args.picollm_frequency_penalty
    picollm_temperature = args.picollm_temperature
    picollm_top_p = args.picollm_top_p
    orca_warmup_sec = args.orca_warmup_sec
    profile = args.profile
    short_answers = args.short_answers

    if keyword_model_path is None:
        porcupine = pvporcupine.create(access_key=access_key, keywords=['picovoice'])
    else:
        porcupine = pvporcupine.create(access_key=access_key, keyword_paths=[keyword_model_path])
    print(f"→ Porcupine V{porcupine.version}")

    cheetah = pvcheetah.create(
        access_key=access_key,
        endpoint_duration_sec=cheetah_endpoint_duration_sec,
        enable_automatic_punctuation=True)
    print(f"→ Cheetah V{cheetah.version}")

    pllm = picollm.create(access_key=access_key, model_path=picollm_model_path, device=picollm_device)
    dialog = pllm.get_dialog()
    print(f"→ picoLLM V{pllm.version} <{pllm.model}>")

    main_connection, orca_process_connection = Pipe()
    orca_process = Process(target=orca_worker, args=(access_key, orca_process_connection, orca_warmup_sec))
    orca_process.start()
    while not main_connection.poll():
        time.sleep(0.01)
    print(f"→ Orca V{main_connection.recv()['version']}")

    mic = PvRecorder(frame_length=porcupine.frame_length)
    mic.start()

    print(f"\n$ Say {'`Picovoice`' if keyword_model_path is None else 'the wake word'} ...")

    stop = [False]

    def handler(_, __) -> None:
        stop[0] = True

    signal.signal(signal.SIGINT, handler)

    wake_word_detected = False
    user_request = ''
    endpoint_reached = False
    utterance_end_sec = 0

    porcupine_profiler = RTFProfiler(porcupine.sample_rate)
    cheetah_profiler = RTFProfiler(cheetah.sample_rate)
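
    # Interaction loop: wait for the wake word (Porcupine), stream the user's request through
    # Cheetah until an endpoint (pause) is detected, then generate the reply with picoLLM while
    # streaming each text chunk to the Orca worker process for synthesis and playback.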
{user_request}" if short_answers else user_request) + + picollm_profiler = TPSProfiler() + + stop_phrases = { + '', # Llama-2, Mistral, and Mixtral + '', # Gemma + '<|endoftext|>', # Phi-2 + '<|eot_id|>', # Llama-3 + } + + completion = [''] + + def llm_callback(text: str) -> None: + picollm_profiler.tock() + completion[0] += text + if not any(x in completion[0] for x in stop_phrases): + main_connection.send({ + 'command': 'synthesize', + 'text': text.replace('\n', ' . '), + 'utterance_end_sec': utterance_end_sec}) + print(text, end='', flush=True) + + print("\nLLM > ", end='', flush=True) + res = pllm.generate( + prompt=dialog.prompt(), + completion_token_limit=picollm_completion_token_limit, + stop_phrases=stop_phrases, + presence_penalty=picollm_presence_penalty, + frequency_penalty=picollm_frequency_penalty, + temperature=picollm_temperature, + top_p=picollm_top_p, + stream_callback=llm_callback) + main_connection.send({'command': 'flush'}) + print('\n') + dialog.add_llm_response(res.completion) + if profile: + print(f"[picoLLM TPS: {picollm_profiler.tps():.2f}]") + + while not main_connection.poll(): + time.sleep(0.01) + message = main_connection.recv() + if profile: + print(f"[Orca RTF: {message['rtf']:.2f}]") + print(f"[Delay: {message['delay']:.2f} sec]") + while not main_connection.poll(): + time.sleep(0.01) + assert main_connection.recv()['done'] + + wake_word_detected = False + user_request = '' + endpoint_reached = False + print(f"\n$ Say {'`Picovoice`' if keyword_model_path is None else 'the wake word'} ...") + finally: + main_connection.send({'command': 'close'}) + mic.delete() + pllm.release() + cheetah.delete() + porcupine.delete() + orca_process.join() + + +if __name__ == '__main__': + main() diff --git a/recipes/llm-voice-assistant/python/requirements.txt b/recipes/llm-voice-assistant/python/requirements.txt new file mode 100644 index 0000000..f911465 --- /dev/null +++ b/recipes/llm-voice-assistant/python/requirements.txt @@ -0,0 +1,7 @@ +numpy +picollm==1.0.0 +pvcheetah==2.0.1 +pvorca==0.2.2 +pvporcupine==3.0.2 +pvrecorder==1.2.2 +sounddevice diff --git a/res/.lint/spell-check/dict.txt b/res/.lint/spell-check/dict.txt index f400b3f..1893aea 100644 --- a/res/.lint/spell-check/dict.txt +++ b/res/.lint/spell-check/dict.txt @@ -1,2 +1,16 @@ +dtype +endoftext +logit +mixtral +numpy pico -picovoice \ No newline at end of file +picollm +picovoice +pllm +pvcheetah +pvorca +pvporcupine +pvrecorder +samplerate +sounddevice +tock \ No newline at end of file