diff --git a/llm_ft/playground/test_embedding/README.md b/llm_ft/playground/test_embedding/README.md
deleted file mode 100644
index 57ac73c5..00000000
--- a/llm_ft/playground/test_embedding/README.md
+++ /dev/null
@@ -1,15 +0,0 @@
-## Machine Learning with Embeddings
-You can use embeddings to
-- Evaluate text similarity, see [test_sentence_similarity.py](test_sentence_similarity.py)
-- Build your own classifier, see [test_classification.py](test_classification.py)
-- Search relative texts, see [test_semantic_search.py](test_semantic_search.py)
-
-To these tests, you need to download the data [here](https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews). You also need an OpenAI API key for comparison.
-
-Run with:
-```bash
-cd playground/test_embedding
-python3 test_classification.py
-```
-
-The script will train classifiers based on `vicuna-7b`, `text-similarity-ada-001` and `text-embedding-ada-002` and report the accuracy of each classifier.
diff --git a/llm_ft/playground/test_embedding/test_classification.py b/llm_ft/playground/test_embedding/test_classification.py
deleted file mode 100644
index 393827bb..00000000
--- a/llm_ft/playground/test_embedding/test_classification.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import json
-import os
-
-import numpy as np
-import openai
-import pandas as pd
-import requests
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import classification_report, accuracy_score
-
-
-np.set_printoptions(threshold=10000)
-
-
-def get_embedding_from_api(word, model="vicuna-7b-v1.1"):
-    if "ada" in model:
-        resp = openai.Embedding.create(
-            model=model,
-            input=word,
-        )
-        embedding = np.array(resp["data"][0]["embedding"])
-        return embedding
-
-    url = "http://localhost:8000/v1/embeddings"
-    headers = {"Content-Type": "application/json"}
-    data = json.dumps({"model": model, "input": word})
-
-    response = requests.post(url, headers=headers, data=data)
-    if response.status_code == 200:
-        embedding = np.array(response.json()["data"][0]["embedding"])
-        return embedding
-    else:
-        print(f"Error: {response.status_code} - {response.text}")
-        return None
-
-
-def create_embedding_data_frame(data_path, model, max_tokens=500):
-    df = pd.read_csv(data_path, index_col=0)
-    df = df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
-    df = df.dropna()
-    df["combined"] = (
-        "Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
-    )
-    top_n = 1000
-    df = df.sort_values("Time").tail(top_n * 2)
-    df.drop("Time", axis=1, inplace=True)
-
-    df["n_tokens"] = df.combined.apply(lambda x: len(x))
-    df = df[df.n_tokens <= max_tokens].tail(top_n)
-    df["embedding"] = df.combined.apply(lambda x: get_embedding_from_api(x, model))
-    return df
-
-
-def train_random_forest(df):
-    X_train, X_test, y_train, y_test = train_test_split(
-        list(df.embedding.values), df.Score, test_size=0.2, random_state=42
-    )
-
-    clf = RandomForestClassifier(n_estimators=100)
-    clf.fit(X_train, y_train)
-    preds = clf.predict(X_test)
-
-    report = classification_report(y_test, preds)
-    accuracy = accuracy_score(y_test, preds)
-    return clf, accuracy, report
-
-
-input_datapath = "amazon_fine_food_review.csv"
-if not os.path.exists(input_datapath):
-    raise Exception(
-        f"Please download data from: https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews"
-    )
-
-df = create_embedding_data_frame(input_datapath, "vicuna-7b-v1.1")
-clf, accuracy, report = train_random_forest(df)
-print(f"Vicuna-7b-v1.1 accuracy:{accuracy}")
-df = create_embedding_data_frame(input_datapath, "text-similarity-ada-001")
-clf, accuracy, report = train_random_forest(df)
-print(f"text-similarity-ada-001 accuracy:{accuracy}")
-df = create_embedding_data_frame(input_datapath, "text-embedding-ada-002")
-clf, accuracy, report = train_random_forest(df)
-print(f"text-embedding-ada-002 accuracy:{accuracy}")
diff --git a/llm_ft/playground/test_embedding/test_semantic_search.py b/llm_ft/playground/test_embedding/test_semantic_search.py
deleted file mode 100644
index 879b240b..00000000
--- a/llm_ft/playground/test_embedding/test_semantic_search.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import json
-import os
-
-import numpy as np
-import openai
-import pandas as pd
-import requests
-from scipy.spatial.distance import cosine
-
-
-def cosine_similarity(vec1, vec2):
-    try:
-        return 1 - cosine(vec1, vec2)
-    except:
-        print(vec1.shape, vec2.shape)
-
-
-def get_embedding_from_api(word, model="vicuna-7b-v1.1"):
-    if "ada" in model:
-        resp = openai.Embedding.create(
-            model=model,
-            input=word,
-        )
-        embedding = np.array(resp["data"][0]["embedding"])
-        return embedding
-
-    url = "http://localhost:8000/v1/embeddings"
-    headers = {"Content-Type": "application/json"}
-    data = json.dumps({"model": model, "input": word})
-
-    response = requests.post(url, headers=headers, data=data)
-    if response.status_code == 200:
-        embedding = np.array(response.json()["data"][0]["embedding"])
-        return embedding
-    else:
-        print(f"Error: {response.status_code} - {response.text}")
-        return None
-
-
-def create_embedding_data_frame(data_path, model, max_tokens=500):
-    df = pd.read_csv(data_path, index_col=0)
-    df = df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
-    df = df.dropna()
-    df["combined"] = (
-        "Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
-    )
-    top_n = 1000
-    df = df.sort_values("Time").tail(top_n * 2)
-    df.drop("Time", axis=1, inplace=True)
-
-    df["n_tokens"] = df.combined.apply(lambda x: len(x))
-    df = df[df.n_tokens <= max_tokens].tail(top_n)
-    df["embedding"] = df.combined.apply(lambda x: get_embedding_from_api(x, model))
-    return df
-
-
-def search_reviews(df, product_description, n=3, pprint=False, model="vicuna-7b-v1.1"):
-    product_embedding = get_embedding_from_api(product_description, model=model)
-    df["similarity"] = df.embedding.apply(
-        lambda x: cosine_similarity(x, product_embedding)
-    )
-
-    results = (
-        df.sort_values("similarity", ascending=False)
-        .head(n)
-        .combined.str.replace("Title: ", "")
-        .str.replace("; Content:", ": ")
-    )
-    if pprint:
-        for r in results:
-            print(r[:200])
-            print()
-    return results
-
-
-def print_model_search(input_path, model):
-    print(f"Model: {model}")
-    df = create_embedding_data_frame(input_path, model)
-    print("search: delicious beans")
-    results = search_reviews(df, "delicious beans", n=5, model=model)
-    print(results)
-    print("search: whole wheat pasta")
-    results = search_reviews(df, "whole wheat pasta", n=5, model=model)
-    print(results)
-    print("search: bad delivery")
-    results = search_reviews(df, "bad delivery", n=5, model=model)
-    print(results)
-
-
-input_datapath = "amazon_fine_food_review.csv"
-if not os.path.exists(input_datapath):
-    raise Exception(
-        f"Please download data from: https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews"
-    )
-
-
-print_model_search(input_datapath, "vicuna-7b-v1.1")
-print_model_search(input_datapath, "text-similarity-ada-001")
-print_model_search(input_datapath, "text-embedding-ada-002")
diff --git a/llm_ft/playground/test_embedding/test_sentence_similarity.py b/llm_ft/playground/test_embedding/test_sentence_similarity.py
deleted file mode 100644
index 0b9a5408..00000000
--- a/llm_ft/playground/test_embedding/test_sentence_similarity.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import json
-import os
-
-import numpy as np
-import openai
-import requests
-from scipy.spatial.distance import cosine
-
-
-def get_embedding_from_api(word, model="vicuna-7b-v1.1"):
-    if "ada" in model:
-        resp = openai.Embedding.create(
-            model=model,
-            input=word,
-        )
-        embedding = np.array(resp["data"][0]["embedding"])
-        return embedding
-
-    url = "http://localhost:8000/v1/embeddings"
-    headers = {"Content-Type": "application/json"}
-    data = json.dumps({"model": model, "input": word})
-
-    response = requests.post(url, headers=headers, data=data)
-    if response.status_code == 200:
-        embedding = np.array(response.json()["data"][0]["embedding"])
-        return embedding
-    else:
-        print(f"Error: {response.status_code} - {response.text}")
-        return None
-
-
-def cosine_similarity(vec1, vec2):
-    return 1 - cosine(vec1, vec2)
-
-
-def print_cosine_similarity(embeddings, texts):
-    for i in range(len(texts)):
-        for j in range(i + 1, len(texts)):
-            sim = cosine_similarity(embeddings[texts[i]], embeddings[texts[j]])
-            print(f"Cosine similarity between '{texts[i]}' and '{texts[j]}': {sim:.2f}")
-
-
-texts = [
-    "The quick brown fox",
-    "The quick brown dog",
-    "The fast brown fox",
-    "A completely different sentence",
-]
-
-embeddings = {}
-for text in texts:
-    embeddings[text] = get_embedding_from_api(text)
-
-print("Vicuna-7B:")
-print_cosine_similarity(embeddings, texts)
-
-for text in texts:
-    embeddings[text] = get_embedding_from_api(text, model="text-similarity-ada-001")
-
-print("text-similarity-ada-001:")
-print_cosine_similarity(embeddings, texts)
-
-for text in texts:
-    embeddings[text] = get_embedding_from_api(text, model="text-embedding-ada-002")
-
-print("text-embedding-ada-002:")
-print_cosine_similarity(embeddings, texts)
diff --git a/llm_ft/pyproject.toml b/llm_ft/pyproject.toml
index b7109f05..1e6cdf58 100644
--- a/llm_ft/pyproject.toml
+++ b/llm_ft/pyproject.toml
@@ -21,7 +21,7 @@ dependencies = [
 
 [project.optional-dependencies]
 model_worker = ["accelerate>=0.21", "peft", "sentencepiece", "torch", "transformers>=4.31.0", "protobuf"]
 webui = ["gradio"]
-train = ["einops", "flash-attn>=2.0", "wandb"]
+train = ["einops", "flash-attn>=2.0", "wandb", "deepspeed", "peft", "bitsandbytes", "scipy", "sentencepiece"]
 llm_judge = ["openai", "anthropic>=0.3", "ray"]
 dev = ["black==23.3.0", "pylint==2.8.2"]
diff --git a/llm_ft/qlora.sh b/llm_ft/qlora.sh
new file mode 100644
index 00000000..ddd9a752
--- /dev/null
+++ b/llm_ft/qlora.sh
@@ -0,0 +1,25 @@
+deepspeed fastchat/train/train_lora.py \
+    --model_name_or_path ./vicuna-7b-1.5 \
+    --lora_r 8 \
+    --lora_alpha 16 \
+    --lora_dropout 0.05 \
+    --data_path ./data/dummy_conversation.json \
+    --bf16 True \
+    --output_dir ./checkpoints \
+    --num_train_epochs 3 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 1 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 1200 \
+    --save_total_limit 100 \
+    --learning_rate 2e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --tf32 True \
+    --model_max_length 2048 \
+    --q_lora True \
+    --deepspeed playground/deepspeed_config_s2.json
\ No newline at end of file
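The expanded `train` extras and the new `qlora.sh` are meant to be used together. A minimal launch sequence might look like the sketch below; the extras group, script name, and flags come from the diff above, while the repo-root working directory and single-node GPU setup are assumptions.

```bash
# Hypothetical usage sketch (not part of the diff). Assumes the
# ./vicuna-7b-1.5 weights and playground/deepspeed_config_s2.json
# referenced by qlora.sh are already in place.
cd llm_ft

# Install the expanded train extras, which now pull in deepspeed, peft,
# bitsandbytes, scipy, and sentencepiece per the pyproject.toml change.
pip install -e ".[train]"

# Launch QLoRA fine-tuning via the new script, which wraps
# `deepspeed fastchat/train/train_lora.py ... --q_lora True`.
bash qlora.sh
```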