
Commit

Add requirements.txt and README.md for voice chat in German
willeasp committed Dec 10, 2023
1 parent 6cf0cb7 commit 58526ae
Showing 6 changed files with 102 additions and 36 deletions.
16 changes: 14 additions & 2 deletions Lab2/README.md
@@ -1,5 +1,7 @@
# ID 2223 Scalable Machine Learning and Deep Learning

Link to huggingface spaces: https://huggingface.co/spaces/willeasp/voice-chat-german

## Assignment - Train Whisper Model


@@ -44,8 +46,18 @@ This process however has a few flaws:
2. Since we are evaluating the model on a subset of the validation data, we do not get a good representation of the model's performance (a sketch of such a subset evaluation follows below). Our assumption was that the model would nevertheless converge to a good model over many iterations. This assumption proved incorrect, as the final model performed similarly to the original model.
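
For illustration, here is a minimal sketch of such a subset evaluation. The dataset name, the `transcribe` callable, and the subset size are assumptions for this sketch, not our exact training code:

```python
# Hypothetical sketch: compute WER on a small random slice of the validation
# split. Cheaper per evaluation round, but a noisier estimate of performance.
import evaluate
from datasets import load_dataset

wer_metric = evaluate.load("wer")

def evaluate_on_subset(transcribe, n=64, seed=0):
    subset = (
        load_dataset("mozilla-foundation/common_voice_11_0", "de", split="validation")
        .shuffle(seed=seed)
        .select(range(n))  # only n examples instead of the full split
    )
    predictions = [transcribe(ex["audio"]["array"]) for ex in subset]
    references = [ex["sentence"] for ex in subset]
    return wer_metric.compute(predictions=predictions, references=references)
```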


### Something about the UI (!!!WIP!!!)
### Service - Voice Chat in German

The goal of the service is to provide a voice chat in German, using the Whisper model for speech recognition and the Llama model to generate the chat responses.
The Google Text-to-Speech API (via gTTS) is used to generate the audio that is read back to the user.
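
For reference, generating the spoken reply with gTTS is a single call; the text and filename here are just examples:

```python
# Sketch: render a German reply to an MP3 file that the UI can play back
from gtts import gTTS

tts = gTTS("Hallo, wie kann ich Ihnen heute helfen?", lang="de")
tts.save("reply.mp3")
```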

The interface is built using Gradio Blocks, and provides both microphone and text input options.
The chat is presented using the Gradio Chatbot component.
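
A minimal sketch of such a Blocks layout is shown below; the component wiring and names are illustrative, not a copy of app.py (the Whisper transcription step for the microphone input is omitted):

```python
# Sketch of a Blocks UI with chat history, microphone, and text input
import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()  # renders [user, bot] message pairs
    mic = gr.Audio(sources=["microphone"], type="filepath")
    text = gr.Textbox(placeholder="Nachricht eingeben...")

    def user(user_message, history):
        # append the user's turn; the bot's reply is filled in afterwards
        return "", history + [[user_message, None]]

    text.submit(user, [text, chatbot], [text, chatbot])

demo.launch()
```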

We found that we had a hard time with Llama, since it often returns only a zero-width space character (`"\u200b"`), which is not visible in the chat.
However, this is not a problem with the service itself, but rather with the Llama model.
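
A simple guard (not part of the current code) would be to strip the zero-width characters before displaying a reply:

```python
# Hypothetical helper: drop zero-width spaces; fall back to a fixed reply
def clean_reply(reply: str) -> str:
    cleaned = reply.replace("\u200b", "").strip()
    return cleaned if cleaned else "Entschuldigung, bitte versuchen Sie es erneut."
```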

The service can be found at https://huggingface.co/spaces/willeasp/voice-chat-german


### Evaluation of the model:
@@ -84,7 +96,7 @@ Our assumption is that the model would have needed about 100 rounds of training
At roughly two hours per round, this would have taken about 200 hours to complete.
This is why we decided to stop the training after 20 rounds.

### Conclusion:
### Conclusions:

Our training neither worsened nor improved the model.
Our assumption is that we would have needed to train the model for a lot longer to see significant improvements.
35 changes: 35 additions & 0 deletions Lab2/voice-chat-german/.gitattributes
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
12 changes: 12 additions & 0 deletions Lab2/voice-chat-german/README.md
@@ -0,0 +1,12 @@
---
title: Voice Chat German
emoji: 📊
colorFrom: indigo
colorTo: indigo
sdk: gradio
sdk_version: 4.8.0
app_file: app.py
pinned: false
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
9 changes: 3 additions & 6 deletions Lab2/UI/app.py → Lab2/voice-chat-german/app.py
@@ -1,14 +1,10 @@
import gradio as gr
import random
import numpy as np
import time

from gtts import gTTS
from transformers import pipeline
from llama import ask_llama
from llama import ask_llama_yield
from model_loader import model_loader
from gpt4all import GPT4All

from transformers import WhisperTokenizer
from transformers import WhisperProcessor
@@ -46,14 +42,15 @@ def user(user_message, history):

def create_query(history):
    #query = "This is a conversation between user and llama, a friendly chatbot. respond in simple text. NOT MARKDOWN.\n\n"
    query = "Dies ist eine Konversation zwischen einem Nutzer und llama, einem freundlichen chatbot. antworte in einfachem text. Antworte in deutsch. \n\n"
    query = "Dies ist eine Konversation zwischen einem Nutzer und llama, einem freundlichen chatbot. antworte in einfachem text. Antworte in deutsch.\nUser: hallo 😍\nllama: Hallo, wie kann ich Ihnen heute helfen?\n"
    for message in history:
        query += "Nutzer: " + message[0] + "\n\nllama: " + (message[1] + "\n\n" if message[1] else "")
        query += "Nutzer: " + message[0] + "\nllama: " + (message[1] + "\n" if message[1] else "")
    print("query: ", query)
    return query
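
# Worked example (illustrative): with history = [["Wie geht's?", None]], the
# newer prompt format above assembles the following query:
#   Dies ist eine Konversation zwischen einem Nutzer und llama, einem freundlichen chatbot. antworte in einfachem text. Antworte in deutsch.
#   User: hallo 😍
#   llama: Hallo, wie kann ich Ihnen heute helfen?
#   Nutzer: Wie geht's?
#   llama: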

def bot(history):
    print("bot")
    print("history", history)
    history[-1][1] = ask_llama(create_query(history))
    return history

58 changes: 30 additions & 28 deletions Lab2/UI/llama.py → Lab2/voice-chat-german/llama.py
@@ -1,28 +1,35 @@
import requests
import json
import string

url = 'https://llama.app.cloud.cbh.kth.se/completion'
headers = {
    'authority': 'llama.app.cloud.cbh.kth.se',
    'accept': 'text/event-stream',
    'accept-language': 'sv,en;q=0.9,en-GB;q=0.8,en-US;q=0.7',
    'content-type': 'application/json',
    'origin': 'https://llama.app.cloud.cbh.kth.se',
    'referer': 'https://llama.app.cloud.cbh.kth.se/',
    'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
}
headers = {}
# headers = {
# 'authority': 'llama.app.cloud.cbh.kth.se',
# 'accept': 'text/event-stream',
# 'accept-language': 'sv,en;q=0.9,en-GB;q=0.8,en-US;q=0.7',
# 'content-type': 'application/json',
# 'origin': 'https://llama.app.cloud.cbh.kth.se',
# 'referer': 'https://llama.app.cloud.cbh.kth.se/',
# 'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"macOS"',
# 'sec-fetch-dest': 'empty',
# 'sec-fetch-mode': 'cors',
# 'sec-fetch-site': 'same-origin',
# 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
# }

USERTOKEN="<|prompter|>"
ASSISTANTTOKEN="<|assistant|>"
ENDTOKEN="<|endoftext|>"
PUBLIC_PREPROMPT="Below are a series of dialogues between various people and an AI assistant. The AI tries to be helpful, polite, honest, sophisticated, emotionally aware, and humble-but-knowledgeable. The assistant is happy to help with almost anything, and will do its best to understand exactly what is needed. It also tries to avoid giving false or misleading information, and it caveats when it isn't entirely sure about the right answer. That said, the assistant is practical and really does its best, and doesn't let caution get too much in the way of being useful."
# PUBLIC_PREPROMPT="Nachfolgend finden Sie eine Reihe von Dialogen zwischen verschiedenen Personen und einem KI-Assistenten. Die KI versucht hilfsbereit, höflich, ehrlich, kultiviert, emotional bewusst und bescheiden, aber kenntnisreich zu sein. Der Assistent hilft Ihnen gerne bei fast allem und wird sein Bestes tun, um genau zu verstehen, was benötigt wird. Außerdem wird versucht, falsche oder irreführende Informationen zu vermeiden, und es gibt Vorbehalte, wenn man sich über die richtige Antwort nicht ganz sicher ist. Allerdings ist der Assistent praktisch und gibt wirklich sein Bestes und lässt die Vorsicht nicht zu sehr in die Quere kommen."

data = {
    "stream": True,
    "n_predict": 400,
    "temperature": 0.7,
    # "temperature": 10.0,
    "stop": ["</s>", "llama:", "Nutzer:"],
    "repeat_last_n": 256,
    "repeat_penalty": 1.18,
@@ -37,24 +44,22 @@
"mirostat_eta": 0.1,
"grammar": "",
"prompt": "",
'ban_eos_token': False,
'skip_special_tokens': True,
'stopping_strings': [ENDTOKEN, f"{USERTOKEN.strip()}", f"{USERTOKEN.strip()}:", f"{ENDTOKEN}{USERTOKEN.strip()}", f"{ENDTOKEN}{USERTOKEN.strip()}:", f"{ASSISTANTTOKEN.strip()}", f"{ASSISTANTTOKEN.strip()}:", f"{ENDTOKEN}{ASSISTANTTOKEN.strip()}:", f"{ENDTOKEN}{ASSISTANTTOKEN.strip()}"],
}

import string
printable = string.ascii_letters + string.digits + string.punctuation + ' '
def hex_escape(s):
    # Make non-printable characters (e.g. zero-width spaces) visible as \xNN escapes when logging model output
    return ''.join(c if c in printable else r'\x{0:02x}'.format(ord(c)) for c in s)

def ask_llama(query):
    # conversation.append("User: " + question + "\n\nLlama: ")
    # prompt = "".join(conversation)
    # print("Prompt: " + prompt)

    data["prompt"] = query
    data["prompt"] = PUBLIC_PREPROMPT + query

    result = []
    with requests.Session() as session:
        # Send the initial request
        response = session.post(url, headers=headers, json=data, stream=True, verify=False)
        response = session.post(url, headers=headers, json=data, stream=True, verify=True)

        # Check for a successful connection
        if response.status_code == 200:
Expand All @@ -63,7 +68,7 @@ def ask_llama(query):
            # Iterate over the lines of the response content
            for line in response.iter_lines(decode_unicode=False):
                if line:
                    # print(line)
                    print("Line:", line)
                    utf8_line = line.decode('utf-8')
                    line_data = json.loads(utf8_line[5:])  # strip the "data:" prefix, then parse the JSON payload
                    content = line_data.get("content")
@@ -74,14 +79,11 @@ else:
        else:
            print(f"Request failed with status code {response.status_code}: {response.text}")
    result = "".join(result)
    print("Result: " + result)
    # conversation.append(result + "\n\n")
    return result

def ask_llama_yield(query):
    # conversation.append("User: " + question + "\n\nLlama: ")
    # prompt = "".join(conversation)
    # print("Prompt: " + prompt)

    data["prompt"] = query

    with requests.Session() as session:
8 changes: 8 additions & 0 deletions Lab2/voice-chat-german/requirements.txt
@@ -0,0 +1,8 @@
gradio==4.8.0
numpy==1.26.2
gTTS==2.4.0
transformers==4.35.2
requests==2.31.0
torch==2.1.1
torchaudio==2.1.1
torchvision==0.16.1
