diff --git a/Lab2/README.md b/Lab2/README.md
index 91c252f..1614906 100644
--- a/Lab2/README.md
+++ b/Lab2/README.md
@@ -1,5 +1,7 @@
 # ID 2223 Scalable Machine Learning and Deep Learning
 
+Link to Hugging Face Spaces: https://huggingface.co/spaces/willeasp/voice-chat-german
+
 ## Assignment - Train Whisper Model
@@ -44,8 +46,18 @@ This process however has a few flaws:
 2. Since we are evaluating the model on a subset of the validation data, we are not getting a good representation of the model's performance.
 
 Our assumption was that the model would nevertheless converge to a good model over many iterations.
 This assumption proved incorrect, as the final model performed similarly to the original model.
 
-### Something about the UI (!!!WIP!!!)
+### Service - Voice Chat in German
+
+The goal of the service is to provide a voice chat in German, using the Whisper model for speech recognition and the llama model to generate the chat replies.
+The Google text-to-speech API is used to generate the audio that is read back to the user.
+
+The interface is built using Gradio Blocks and provides both microphone and text input.
+The chat is presented using the Gradio Chatbot component.
+
+We had a hard time with llama, since it often returns only a zero-width space character (`"\u200b"`), which is not visible in the chat.
+However, this is not a problem with the service, but rather with the llama model itself.
+The service can be found at https://huggingface.co/spaces/willeasp/voice-chat-german
 
 ### Evaluation for the model:
@@ -84,7 +96,7 @@ Our assumption is that the model would have needed about 100 rounds of training
 This would have taken about 200 hours to complete.
 This is why we decided to stop the training after 20 runs.
 
-### Conclusion:
+### Conclusions:
 
 Our training did not worsen or improve the model.
 Our assumption is that we would have needed to train the model for a lot longer to see significant improvements.
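To make the service description above concrete, here is a minimal sketch of the pipeline as one function: transcribe the recorded audio with a `transformers` Whisper pipeline, send the transcript to llama, and synthesize the reply with gTTS. This is not the committed `app.py` (which wires these steps into Gradio callbacks), and the checkpoint name is a placeholder assumption.

```python
# Minimal sketch of the speech -> llama -> speech pipeline described above.
# Assumptions: "openai/whisper-small" is a placeholder checkpoint (the Space
# uses its own fine-tuned model), and ask_llama comes from the llama.py
# shown later in this diff.
from gtts import gTTS
from transformers import pipeline

from llama import ask_llama

transcriber = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",  # placeholder checkpoint
)

def voice_turn(audio_path: str, out_path: str = "reply.mp3") -> str:
    """One chat turn: German speech in, spoken German reply out."""
    text = transcriber(audio_path)["text"]         # speech recognition
    reply = ask_llama(f"Nutzer: {text}\nllama: ")  # query format from app.py
    gTTS(reply, lang="de").save(out_path)          # text-to-speech for playback
    return reply
```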
diff --git a/Lab2/voice-chat-german/.gitattributes b/Lab2/voice-chat-german/.gitattributes
new file mode 100644
index 0000000..a6344aa
--- /dev/null
+++ b/Lab2/voice-chat-german/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/Lab2/voice-chat-german/README.md b/Lab2/voice-chat-german/README.md
new file mode 100644
index 0000000..7ac4d49
--- /dev/null
+++ b/Lab2/voice-chat-german/README.md
@@ -0,0 +1,12 @@
+---
+title: Voice Chat German
+emoji: 📊
+colorFrom: indigo
+colorTo: indigo
+sdk: gradio
+sdk_version: 4.8.0
+app_file: app.py
+pinned: false
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/Lab2/UI/app.py b/Lab2/voice-chat-german/app.py
similarity index 93%
rename from Lab2/UI/app.py
rename to Lab2/voice-chat-german/app.py
index ec2a837..da0b16f 100644
--- a/Lab2/UI/app.py
+++ b/Lab2/voice-chat-german/app.py
@@ -1,14 +1,10 @@
 import gradio as gr
-import random
 import numpy as np
-import time
 from gtts import gTTS
 from transformers import pipeline
 
 from llama import ask_llama
 from llama import ask_llama_yield
-from model_loader import model_loader
-from gpt4all import GPT4All
 
 from transformers import WhisperTokenizer
 from transformers import WhisperProcessor
@@ -46,14 +42,15 @@ def user(user_message, history):
 
 def create_query(history):
     #query = "This is a conversation between user and llama, a friendly chatbot. respond in simple text. NOT MARKDOWN.\n\n"
-    query = "Dies ist eine Konversation zwischen einem Nutzer und llama, einem freundlichen chatbot. antworte in einfachem text. Antworte in deutsch. \n\n"
+    query = "Dies ist eine Konversation zwischen einem Nutzer und llama, einem freundlichen chatbot. antworte in einfachem text. Antworte in deutsch.\nUser: hallo 😍\nllama: Hallo, wie kann ich Ihnen heute helfen?\n"
     for message in history:
-        query += "Nutzer: " + message[0] + "\n\nllama: " + (message[1] + "\n\n" if message[1] else "")
+        query += "Nutzer: " + message[0] + "\nllama: " + (message[1] + "\n" if message[1] else "")
     print("query: ", query)
     return query
 
 def bot(history):
     print("bot")
+    print("history", history)
     history[-1][1] = ask_llama(create_query(history))
     return history
diff --git a/Lab2/UI/llama.py b/Lab2/voice-chat-german/llama.py
similarity index 51%
rename from Lab2/UI/llama.py
rename to Lab2/voice-chat-german/llama.py
index 8cc1f29..ea99d69 100644
--- a/Lab2/UI/llama.py
+++ b/Lab2/voice-chat-german/llama.py
@@ -1,28 +1,35 @@
 import requests
 import json
+import string
 
 url = 'https://llama.app.cloud.cbh.kth.se/completion'
-headers = {
-    'authority': 'llama.app.cloud.cbh.kth.se',
-    'accept': 'text/event-stream',
-    'accept-language': 'sv,en;q=0.9,en-GB;q=0.8,en-US;q=0.7',
-    'content-type': 'application/json',
-    'origin': 'https://llama.app.cloud.cbh.kth.se',
-    'referer': 'https://llama.app.cloud.cbh.kth.se/',
-    'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
-    'sec-ch-ua-mobile': '?0',
-    'sec-ch-ua-platform': '"macOS"',
-    'sec-fetch-dest': 'empty',
-    'sec-fetch-mode': 'cors',
-    'sec-fetch-site': 'same-origin',
-    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
-}
+headers = {}
+# headers = {
+#     'authority': 'llama.app.cloud.cbh.kth.se',
+#     'accept': 'text/event-stream',
+#     'accept-language': 'sv,en;q=0.9,en-GB;q=0.8,en-US;q=0.7',
+#     'content-type': 'application/json',
+#     'origin': 'https://llama.app.cloud.cbh.kth.se',
+#     'referer': 'https://llama.app.cloud.cbh.kth.se/',
+#     'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
+#     'sec-ch-ua-mobile': '?0',
+#     'sec-ch-ua-platform': '"macOS"',
+#     'sec-fetch-dest': 'empty',
+#     'sec-fetch-mode': 'cors',
+#     'sec-fetch-site': 'same-origin',
+#     'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
+# }
+
+USERTOKEN="<|prompter|>"
+ASSISTANTTOKEN="<|assistant|>"
+ENDTOKEN="<|endoftext|>"
+PUBLIC_PREPROMPT="Below are a series of dialogues between various people and an AI assistant. The AI tries to be helpful, polite, honest, sophisticated, emotionally aware, and humble-but-knowledgeable. The assistant is happy to help with almost anything, and will do its best to understand exactly what is needed. It also tries to avoid giving false or misleading information, and it caveats when it isn't entirely sure about the right answer. That said, the assistant is practical and really does its best, and doesn't let caution get too much in the way of being useful."
+# PUBLIC_PREPROMPT="Nachfolgend finden Sie eine Reihe von Dialogen zwischen verschiedenen Personen und einem KI-Assistenten. Die KI versucht hilfsbereit, höflich, ehrlich, kultiviert, emotional bewusst und bescheiden, aber kenntnisreich zu sein. Der Assistent hilft Ihnen gerne bei fast allem und wird sein Bestes tun, um genau zu verstehen, was benötigt wird. Außerdem wird versucht, falsche oder irreführende Informationen zu vermeiden, und es gibt Vorbehalte, wenn man sich über die richtige Antwort nicht ganz sicher ist. Allerdings ist der Assistent praktisch und gibt wirklich sein Bestes und lässt die Vorsicht nicht zu sehr in die Quere kommen."
 
 data = {
     "stream": True,
     "n_predict": 400,
     "temperature": 0.7,
-    # "temperature": 10.0,
     "stop": ["", "llama:", "Nutzer:"],
     "repeat_last_n": 256,
     "repeat_penalty": 1.18,
@@ -37,24 +44,22 @@
     "mirostat_eta": 0.1,
     "grammar": "",
     "prompt": "",
+    'ban_eos_token': False,
+    'skip_special_tokens': True,
+    'stopping_strings': [ENDTOKEN, f"{USERTOKEN.strip()}", f"{USERTOKEN.strip()}:", f"{ENDTOKEN}{USERTOKEN.strip()}", f"{ENDTOKEN}{USERTOKEN.strip()}:", f"{ASSISTANTTOKEN.strip()}", f"{ASSISTANTTOKEN.strip()}:", f"{ENDTOKEN}{ASSISTANTTOKEN.strip()}:", f"{ENDTOKEN}{ASSISTANTTOKEN.strip()}"],
 }
 
-import string
 printable = string.ascii_letters + string.digits + string.punctuation + ' '
 
 def hex_escape(s):
     return ''.join(c if c in printable else r'\x{0:02x}'.format(ord(c)) for c in s)
 
 def ask_llama(query):
-    # conversation.append("User: " + question + "\n\nLlama: ")
-    # prompt = "".join(conversation)
-    # print("Prompt: " + prompt)
-
-    data["prompt"] = query
+    data["prompt"] = PUBLIC_PREPROMPT + query
 
     result = []
     with requests.Session() as session:
         # Send the initial request
-        response = session.post(url, headers=headers, json=data, stream=True, verify=False)
+        response = session.post(url, headers=headers, json=data, stream=True, verify=True)
 
         # Check for a successful connection
         if response.status_code == 200:
@@ -63,7 +68,7 @@ def ask_llama(query):
             # Iterate over the lines of the response content
             for line in response.iter_lines(decode_unicode=False):
                 if line:
-                    # print(line)
+                    print("Line:", line)
                     utf8_line = line.decode('utf-8')
                     line_data = json.loads(utf8_line[5:])  # Remove "data: " prefix and parse JSON
                     content = line_data.get("content")
@@ -74,14 +79,11 @@
         else:
             print(f"Request failed with status code {response.status_code}: {response.text}")
     result = "".join(result)
+    print("Result: " + result)
     # conversation.append(result + "\n\n")
     return result
 
 def ask_llama_yield(query):
-    # conversation.append("User: " + question + "\n\nLlama: ")
-    # prompt = "".join(conversation)
-    # print("Prompt: " + prompt)
-
     data["prompt"] = query
 
     with requests.Session() as session:
diff --git a/Lab2/voice-chat-german/requirements.txt b/Lab2/voice-chat-german/requirements.txt
new file mode 100644
index 0000000..07f5372
--- /dev/null
+++ b/Lab2/voice-chat-german/requirements.txt
@@ -0,0 +1,8 @@
+gradio==4.8.0
+numpy==1.26.2
+gTTS==2.4.0
+transformers==4.35.2
+requests==2.31.0
+torch==2.1.1
+torchaudio==2.1.1
+torchvision==0.16.1
\ No newline at end of file
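As a final note, the zero-width-space replies mentioned in the Lab2 README could be filtered on the service side. A hedged sketch (not part of the committed code) that strips `\u200b` and falls back to a fixed German reply when nothing visible remains:

```python
# Hypothetical guard against the "\u200b"-only replies noted in the README;
# not part of the committed service code.
ZWSP = "\u200b"

def clean_reply(raw: str,
                fallback: str = "Entschuldigung, das habe ich nicht verstanden.") -> str:
    """Strip zero-width spaces; return a stock reply if the output is effectively empty."""
    text = raw.replace(ZWSP, "").strip()
    return text if text else fallback
```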