From 669997a6baf0979f9414e1f45bc8363ad07035c1 Mon Sep 17 00:00:00 2001 From: aasavari Date: Thu, 19 Sep 2024 04:33:00 +0000 Subject: [PATCH 01/32] minimized required fields/columns in user data Signed-off-by: aasavari --- evals/metrics/ragas/ragas.py | 80 +++++++++++++++--------------------- 1 file changed, 34 insertions(+), 46 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 35449c08..672ac39e 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -1,24 +1,23 @@ + #!/usr/bin/env python # -*- coding: utf-8 -*- # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 - # import os from typing import Dict, Optional, Union - +from langchain_huggingface import HuggingFaceEndpoint from langchain_core.embeddings import Embeddings from langchain_core.language_models import BaseLanguageModel -from langchain_huggingface import HuggingFaceEndpoint +import sys +sys.path.append('/home/akakne/miniforge3/envs/recsys/bin') def format_ragas_metric_name(name: str): return f"{name} (ragas)" - class RagasMetric: """This metric checks if the output is more than 3 letters.""" - def __init__( self, threshold: float = 0.3, @@ -26,7 +25,6 @@ def __init__( embeddings: Optional[Embeddings] = None, metrics: Optional[list[str]] = None, ): - self.threshold = threshold self.model = model self.embeddings = embeddings @@ -39,10 +37,14 @@ def __init__( "context_recall", "faithfulness", "context_utilization", - "reference_free_rubrics_score", + # "reference_free_rubrics_score", ] - + async def a_measure(self, test_case: Dict): + return self.measure(test_case) + def measure(self, test_case: Dict): + # sends to server try: + from ragas import evaluate from ragas.metrics import ( answer_correctness, answer_relevancy, @@ -51,16 +53,14 @@ def __init__( context_recall, context_utilization, faithfulness, - reference_free_rubrics_score, + # reference_free_rubrics_score, ) except ModuleNotFoundError: raise ModuleNotFoundError("Please install ragas to use this metric. 
`pip install ragas`.") - try: from datasets import Dataset except ModuleNotFoundError: raise ModuleNotFoundError("Please install dataset") - self.metrics_instance = { "answer_correctness": answer_correctness, "answer_relevancy": answer_relevancy, @@ -69,26 +69,24 @@ def __init__( "context_recall": context_recall, "faithfulness": faithfulness, "context_utilization": context_utilization, - "reference_free_rubrics_score": reference_free_rubrics_score, + # "reference_free_rubrics_score": reference_free_rubrics_score, } - # Set LLM model openai_key = os.getenv("OPENAI_API_KEY", None) if openai_key is not None: print("OPENAI_API_KEY is provided, ragas initializes the model by OpenAI.") self.model = None if isinstance(self.model, str): - print("LLM endpoint: ", self.model) - self.chat_model = HuggingFaceEndpoint( + print("Loading a HuggingFace Endpoint") + chat_model = HuggingFaceEndpoint( endpoint_url=self.model, - task="text-generation", - max_new_tokens=1024, - do_sample=False, + timeout=600, ) else: - self.chat_model = self.model - - # initialize metrics + print("Accepting user-initialized model as we could not detect OpenAI key or HuggingFace Endpoint URL.") + chat_model = self.model + # Create a dataset from the test case + # Convert the Dict to a format compatible with Dataset if self.metrics is not None: tmp_metrics = [] # check supported list @@ -106,10 +104,8 @@ def __init__( if metric == "answer_relevancy" and self.embeddings is None: raise ValueError("answer_relevancy metric need provide embeddings model.") tmp_metrics.append(self.metrics_instance[metric]) - self.metrics = tmp_metrics - - else: # default metrics + else: self.metrics = [ answer_relevancy, faithfulness, @@ -118,39 +114,31 @@ def __init__( context_precision, context_recall, ] - - async def a_measure(self, test_case: Dict): - return self.measure(test_case) - - def measure(self, test_case: Dict): - from ragas import evaluate - - try: - from datasets import Dataset - except ModuleNotFoundError: - raise ModuleNotFoundError("Please install dataset") - - # Create a dataset from the test case - # Convert the Dict to a format compatible with Dataset - data = { - "question": test_case["question"], - "contexts": test_case["contexts"], - "answer": test_case["answer"], - "ground_truth": test_case["ground_truth"], + # Find necessary input fields using the given metrics + _required_columns = set() + for metric in self.metrics: + for column in list(metric._required_columns.values())[0]: + _required_columns.add(column) + column2field = { + "user_input" : "question", + "response" : "answer", + "reference" : "ground_truth", + "retrieved_contexts" : "contexts" } + _required_fields = [column2field[column] for column in _required_columns] + data = {field : test_case[field] for field in _required_fields} dataset = Dataset.from_dict(data) + # evaluate self.score = evaluate( dataset, metrics=self.metrics, - llm=self.chat_model, + llm=chat_model, embeddings=self.embeddings, ) return self.score - def is_successful(self): return self.success - @property def __name__(self): return "RAGAS" From 80e21609655529e8cd40354a64935417d4b70feb Mon Sep 17 00:00:00 2001 From: Ying Chun Guo Date: Thu, 19 Sep 2024 14:33:11 +0800 Subject: [PATCH 02/32] add bench-target as the prefix of output folder (#133) Signed-off-by: Yingchun Guo Signed-off-by: aasavari --- evals/benchmark/stresscli/commands/load_test.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/evals/benchmark/stresscli/commands/load_test.py 
b/evals/benchmark/stresscli/commands/load_test.py index fba1c5ee..8895e5ab 100644 --- a/evals/benchmark/stresscli/commands/load_test.py +++ b/evals/benchmark/stresscli/commands/load_test.py @@ -46,16 +46,17 @@ def locust_runtests(kubeconfig, profile): with open(profile, "r") as file: profile_data = yaml.safe_load(file) + global_settings = profile_data["profile"]["global-settings"] + runs = profile_data["profile"]["runs"] + # create test log folder hostpath = profile_data["profile"]["storage"]["hostpath"] timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - base_folder = os.path.join(hostpath, f"{timestamp}") + testtarget = global_settings.get("bench-target", locust_defaults["bench-target"]) + base_folder = os.path.join(hostpath, f"{testtarget}_{timestamp}") os.makedirs(base_folder, exist_ok=True) # Extract storage path and run details from profile - global_settings = profile_data["profile"]["global-settings"] - runs = profile_data["profile"]["runs"] - index = 1 for run in runs: print(f"===Starting test: {run['name']}") From eb98d2e32f73b1612d1d84e92e15a2c8b8505a2b Mon Sep 17 00:00:00 2001 From: lkk <33276950+lkk12014402@users.noreply.github.com> Date: Thu, 19 Sep 2024 16:11:01 +0800 Subject: [PATCH 03/32] remove examples. (#135) Co-authored-by: root Signed-off-by: aasavari --- examples/AudioQnA/README.md | 48 --------------- examples/AudioQnA/local_eval.py | 35 ----------- examples/AudioQnA/online_eval.py | 56 ------------------ examples/AudioQnA/requirements.txt | 8 --- examples/CodeGen/README.md | 92 ----------------------------- examples/FaqGen/README.md | 63 -------------------- examples/FaqGen/evaluate.py | 45 -------------- examples/FaqGen/generate_FAQ.py | 28 --------- examples/FaqGen/get_context.py | 17 ------ examples/FaqGen/launch_tgi.sh | 28 --------- examples/FaqGen/post_process_FAQ.py | 27 --------- 11 files changed, 447 deletions(-) delete mode 100644 examples/AudioQnA/README.md delete mode 100644 examples/AudioQnA/local_eval.py delete mode 100644 examples/AudioQnA/online_eval.py delete mode 100644 examples/AudioQnA/requirements.txt delete mode 100644 examples/CodeGen/README.md delete mode 100644 examples/FaqGen/README.md delete mode 100644 examples/FaqGen/evaluate.py delete mode 100644 examples/FaqGen/generate_FAQ.py delete mode 100644 examples/FaqGen/get_context.py delete mode 100644 examples/FaqGen/launch_tgi.sh delete mode 100644 examples/FaqGen/post_process_FAQ.py diff --git a/examples/AudioQnA/README.md b/examples/AudioQnA/README.md deleted file mode 100644 index 918a7997..00000000 --- a/examples/AudioQnA/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# AudioQnA accuracy Evaluation - -## Dataset - - -We evaluate the ASR accuracy on the test set of librispeech [dataset](andreagasparini/librispeech_test_only), which contains 2620 records of audio and texts. - -## Metrics - -We evaluate the WER (Word Error Rate) metric of the ASR microservice. - -## Evaluation - -### Launch ASR microservice - -Launch the ASR microserice with the following commands. For more details please refer to [doc](https://github.com/opea-project/GenAIComps/tree/main/comps/asr). - -```bash -git clone https://github.com/opea-project/GenAIComps -cd GenAIComps -docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile . 
-# change the name of model by editing model_name_or_path you want to evaluate -docker run -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest --model_name_or_path "openai/whisper-tiny" -``` - -### Evaluate - -Install dependencies: - -``` -pip install -r requirements.txt -``` - -Evaluate the performance with the LLM: -```py -# validate the offline model -# python offline_evaluate.py -# validate the online asr microservice accuracy -python online_evaluate.py -``` - -### Performance Result -Here is the tested result for your reference -|| WER | -| --- | ---- | -|whisper-large-v2| 2.87| -|whisper-large| 2.7 | -|whisper-medium| 3.45 | diff --git a/examples/AudioQnA/local_eval.py b/examples/AudioQnA/local_eval.py deleted file mode 100644 index 1ef7b6df..00000000 --- a/examples/AudioQnA/local_eval.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import torch -from datasets import load_dataset -from evaluate import load -from transformers import WhisperForConditionalGeneration, WhisperProcessor - -device = "cuda" if torch.cuda.is_available() else "cpu" - -MODEL_NAME = "openai/whisper-large-v2" - -librispeech_test_clean = load_dataset( - "andreagasparini/librispeech_test_only", "clean", split="test", trust_remote_code=True -) -processor = WhisperProcessor.from_pretrained(MODEL_NAME) -model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(device) - - -def map_to_pred(batch): - audio = batch["audio"] - input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features - batch["reference"] = processor.tokenizer._normalize(batch["text"]) - - with torch.no_grad(): - predicted_ids = model.generate(input_features.to(device))[0] - transcription = processor.decode(predicted_ids) - batch["prediction"] = processor.tokenizer._normalize(transcription) - return batch - - -result = librispeech_test_clean.map(map_to_pred) - -wer = load("wer") -print(100 * wer.compute(references=result["reference"], predictions=result["prediction"])) diff --git a/examples/AudioQnA/online_eval.py b/examples/AudioQnA/online_eval.py deleted file mode 100644 index a7854c95..00000000 --- a/examples/AudioQnA/online_eval.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import base64 -import json - -import requests -import torch -from datasets import load_dataset -from evaluate import load -from pydub import AudioSegment -from transformers import WhisperForConditionalGeneration, WhisperProcessor - -MODEL_NAME = "openai/whisper-large-v2" -processor = WhisperProcessor.from_pretrained(MODEL_NAME) - -librispeech_test_clean = load_dataset( - "andreagasparini/librispeech_test_only", "clean", split="test", trust_remote_code=True -) - - -def map_to_pred(batch): - batch["reference"] = processor.tokenizer._normalize(batch["text"]) - - file_path = batch["file"] - # process the file_path - pidx = file_path.rfind("/") - sidx = file_path.rfind(".") - - file_path_prefix = file_path[: pidx + 1] - file_path_suffix = file_path[sidx:] - file_path_mid = file_path[pidx + 1 : sidx] - splits = file_path_mid.split("-") - file_path_mid = f"LibriSpeech/test-clean/{splits[0]}/{splits[1]}/{file_path_mid}" - - file_path = file_path_prefix + file_path_mid + file_path_suffix - - audio = AudioSegment.from_file(file_path) - audio.export("tmp.wav") - with open("tmp.wav", "rb") as f: - test_audio_base64_str = 
base64.b64encode(f.read()).decode("utf-8") - - inputs = {"audio": test_audio_base64_str} - endpoint = "http://localhost:7066/v1/asr" - response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) - - result_str = response.json()["asr_result"] - - batch["prediction"] = processor.tokenizer._normalize(result_str) - return batch - - -result = librispeech_test_clean.map(map_to_pred) - -wer = load("wer") -print(100 * wer.compute(references=result["reference"], predictions=result["prediction"])) diff --git a/examples/AudioQnA/requirements.txt b/examples/AudioQnA/requirements.txt deleted file mode 100644 index c3f6c51a..00000000 --- a/examples/AudioQnA/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -datasets -evaluate -jiwer -librosa -pydub -soundfile -torch -transformers diff --git a/examples/CodeGen/README.md b/examples/CodeGen/README.md deleted file mode 100644 index 5d118967..00000000 --- a/examples/CodeGen/README.md +++ /dev/null @@ -1,92 +0,0 @@ -# CodeGen accuracy Evaluation - -## Evaluation Framework -We evaluate accuracy by [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness). It is a framework for the evaluation of code generation models. - - -## Evaluation FAQs - -### Launch CodeGen microservice -Please refer to [CodeGen Examples](https://github.com/opea-project/GenAIExamples/tree/main/CodeGen), follow the guide to deploy CodeGen megeservice. - -Use cURL command to test codegen service and ensure that it has started properly -```bash -export CODEGEN_ENDPOINT = "http://${your_ip}:7778/v1/codegen" -curl $CODEGEN_ENDPOINT \ - -H "Content-Type: application/json" \ - -d '{"messages": "Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception."}' - -``` - - -### Generation and Evaluation - -For evaluating the models on coding tasks or specifically coding LLMs, we follow the [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness) and provide the command line usage and function call usage. [HumanEval](https://huggingface.co/datasets/openai_humaneval), [HumanEval+](https://huggingface.co/datasets/evalplus/humanevalplus), [InstructHumanEval](https://huggingface.co/datasets/codeparrot/instructhumaneval), [APPS](https://huggingface.co/datasets/codeparrot/apps), [MBPP](https://huggingface.co/datasets/mbpp), [MBPP+](https://huggingface.co/datasets/evalplus/mbppplus), and [DS-1000](https://github.com/HKUNLP/DS-1000/) for both completion (left-to-right) and insertion (FIM) mode are available. -#### command line usage - -```shell -cd evals/evaluation/bigcode_evaluation_harness/examples -python main.py --model Qwen/CodeQwen1.5-7B-Chat \ - --tasks humaneval \ - --codegen_url $CODEGEN_ENDPOINT \ - --max_length_generation 2048 \ - --batch_size 1 \ - --save_generations \ - --save_references \ - --allow_code_execution -``` - -***Note:*** Currently, our framework is designed to execute tasks in full. To ensure the accuracy of results, we advise against using the 'limit' or 'limit_start' parameters to restrict the number of test samples. 
- - -### accuracy Result -Here is the tested result for your reference -```json -{ - "humaneval": { - "pass@1": 0.7195121951219512 - }, - "config": { - "prefix": "", - "do_sample": true, - "temperature": 0.2, - "top_k": 0, - "top_p": 0.95, - "n_samples": 1, - "eos": "<|endoftext|>", - "seed": 0, - "model": "Qwen/CodeQwen1.5-7B-Chat", - "modeltype": "causal", - "peft_model": null, - "revision": null, - "use_auth_token": false, - "trust_remote_code": false, - "tasks": "humaneval", - "instruction_tokens": null, - "batch_size": 1, - "max_length_generation": 2048, - "precision": "fp32", - "load_in_8bit": false, - "load_in_4bit": false, - "left_padding": false, - "limit": null, - "limit_start": 0, - "save_every_k_tasks": -1, - "postprocess": true, - "allow_code_execution": true, - "generation_only": false, - "load_generations_path": null, - "load_data_path": null, - "metric_output_path": "evaluation_results.json", - "save_generations": true, - "load_generations_intermediate_paths": null, - "save_generations_path": "generations.json", - "save_references": true, - "save_references_path": "references.json", - "prompt": "prompt", - "max_memory_per_gpu": null, - "check_references": false, - "codegen_url": "http://192.168.123.104:31234/v1/codegen" - } -} -``` diff --git a/examples/FaqGen/README.md b/examples/FaqGen/README.md deleted file mode 100644 index 70d66744..00000000 --- a/examples/FaqGen/README.md +++ /dev/null @@ -1,63 +0,0 @@ -# FaqGen Performance Evaluation - -## Dataset -We evaluate performance on QA dataset [Squad_v2](https://huggingface.co/datasets/rajpurkar/squad_v2). Generate FAQs on "context" columns in validation dataset, which contains 1204 unique records. - -First download dataset and put at "./data". - -Extract unique "context" columns, which will be save to 'data/sqv2_context.json': -``` -python get_context.py -``` - -## Generate FAQs - -### Launch FaQGen microservice -Please refer to [FaQGen microservice](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/faq-generation/tgi), set up an microservice endpoint. -``` -export FAQ_ENDPOINT = "http://${your_ip}:9000/v1/faqgen" -``` - -### Generate FAQs with microservice -Use the microservice endpoint to generate FAQs for dataset. -``` -python generate_FAQ.py -``` - -Post-process the output to get the right data, which will be save to 'data/sqv2_faq.json'. -``` -python post_process_FAQ.py -``` - -## Evaluate with Ragas - -### Launch TGI service -We use "mistralai/Mixtral-8x7B-Instruct-v0.1" as LLM referee to evaluate the model. First we need to launch a LLM endpoint on Gaudi. 
-``` -export HUGGING_FACE_HUB_TOKEN="your_huggingface_token" -bash launch_tgi.sh -``` -Get the endpoint: -``` -export LLM_ENDPOINT = "http://${ip_address}:8082" -``` - -Verify the service: -```bash -curl http://${ip_address}:8082/generate \ - -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \ - -H 'Content-Type: application/json' -``` - -### Evaluate -evaluate the performance with the LLM: -``` -python evaluate.py -``` - -### Performance Result -Here is the tested result for your reference -| answer_relevancy | faithfulness | context_utilization | reference_free_rubrics_score | -| ---- | ---- |---- |---- | -| 0.7191 | 0.9681 | 0.8964 | 4.4125| diff --git a/examples/FaqGen/evaluate.py b/examples/FaqGen/evaluate.py deleted file mode 100644 index a082d093..00000000 --- a/examples/FaqGen/evaluate.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import json -import os - -from langchain_community.embeddings import HuggingFaceBgeEmbeddings - -from evals.metrics.ragas import RagasMetric - -llm_endpoint = os.getenv("LLM_ENDPOINT", "http://0.0.0.0:8082") - -f = open("data/sqv2_context.json", "r") -sqv2_context = json.load(f) - -f = open("data/sqv2_faq.json", "r") -sqv2_faq = json.load(f) - -templ = """Create a concise FAQs (frequently asked questions and answers) for following text: - TEXT: {text} - Do not use any prefix or suffix to the FAQ. - """ - -number = 1204 -question = [] -answer = [] -ground_truth = ["None"] * number -contexts = [] -for i in range(number): - inputs = sqv2_context[str(i)] - inputs_faq = templ.format_map({"text": inputs}) - actual_output = sqv2_faq[str(i)] - - question.append(inputs_faq) - answer.append(actual_output) - contexts.append([inputs_faq]) - -embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5") -metrics_faq = ["answer_relevancy", "faithfulness", "context_utilization", "reference_free_rubrics_score"] -metric = RagasMetric(threshold=0.5, model=llm_endpoint, embeddings=embeddings, metrics=metrics_faq) - -test_case = {"question": question, "answer": answer, "ground_truth": ground_truth, "contexts": contexts} - -metric.measure(test_case) -print(metric.score) diff --git a/examples/FaqGen/generate_FAQ.py b/examples/FaqGen/generate_FAQ.py deleted file mode 100644 index 2ed70b9e..00000000 --- a/examples/FaqGen/generate_FAQ.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import json -import os -import time - -import requests - -llm_endpoint = os.getenv("FAQ_ENDPOINT", "http://0.0.0.0:9000/v1/faqgen") - -f = open("data/sqv2_context.json", "r") -sqv2_context = json.load(f) - -start_time = time.time() -headers = {"Content-Type": "application/json"} -for i in range(1204): - start_time_tmp = time.time() - print(i) - inputs = sqv2_context[str(i)] - data = {"query": inputs, "max_new_tokens": 128} - response = requests.post(llm_endpoint, json=data, headers=headers) - f = open(f"data/result/sqv2_faq_{i}", "w") - f.write(inputs) - f.write(str(response.content, encoding="utf-8")) - f.close() - print(f"Cost {time.time()-start_time_tmp} seconds") -print(f"\n Finished! 
\n Totally Cost {time.time()-start_time} seconds\n") diff --git a/examples/FaqGen/get_context.py b/examples/FaqGen/get_context.py deleted file mode 100644 index 8cb73a05..00000000 --- a/examples/FaqGen/get_context.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import json -import os - -import pandas as pd - -data_path = "./data" -data = pd.read_parquet(os.path.join(data_path, "squad_v2/squad_v2/validation-00000-of-00001.parquet")) -sq_context = list(data["context"].unique()) -sq_context_d = dict() -for i in range(len(sq_context)): - sq_context_d[i] = sq_context[i] - -with open(os.path.join(data_path, "sqv2_context.json"), "w") as outfile: - json.dump(sq_context_d, outfile) diff --git a/examples/FaqGen/launch_tgi.sh b/examples/FaqGen/launch_tgi.sh deleted file mode 100644 index b3e04bbb..00000000 --- a/examples/FaqGen/launch_tgi.sh +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -max_input_tokens=3072 -max_total_tokens=4096 -port_number=8082 -model_name="mistralai/Mixtral-8x7B-Instruct-v0.1" -volume="./data" -docker run -it --rm \ - --name="tgi_Mixtral" \ - -p $port_number:80 \ - -v $volume:/data \ - --runtime=habana \ - --restart always \ - -e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \ - --cap-add=sys_nice \ - --ipc=host \ - -e HTTPS_PROXY=$https_proxy \ - -e HTTP_PROXY=$https_proxy \ - ghcr.io/huggingface/tgi-gaudi:2.0.1 \ - --model-id $model_name \ - --max-input-tokens $max_input_tokens \ - --max-total-tokens $max_total_tokens \ - --sharded true \ - --num-shard 2 diff --git a/examples/FaqGen/post_process_FAQ.py b/examples/FaqGen/post_process_FAQ.py deleted file mode 100644 index 83e6b835..00000000 --- a/examples/FaqGen/post_process_FAQ.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import json - -faq_dict = {} -fails = [] -for i in range(1204): - data = open(f"data/result/sqv2_faq_{i}", "r").readlines() - result = data[-6][6:] - # print(result) - if "LLMChain/final_output" not in result: - print(f"error1: fail for {i}") - fails.append(i) - continue - try: - result2 = json.loads(result) - result3 = result2["ops"][0]["value"]["text"] - faq_dict[str(i)] = result3 - except: - print(f"error2: fail for {i}") - fails.append(i) - continue -with open("data/sqv2_faq.json", "w") as outfile: - json.dump(faq_dict, outfile) -print("Failure index:") -print(fails) From ad58bd8d8ed898b750eb8044b640185d2f032a6b Mon Sep 17 00:00:00 2001 From: aasavari Date: Fri, 20 Sep 2024 02:43:05 +0000 Subject: [PATCH 04/32] minor naming correction to maintain consistency Signed-off-by: aasavari --- evals/metrics/ragas/ragas.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 672ac39e..2acd86d8 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -10,9 +10,6 @@ from langchain_core.embeddings import Embeddings from langchain_core.language_models import BaseLanguageModel -import sys -sys.path.append('/home/akakne/miniforge3/envs/recsys/bin') - def format_ragas_metric_name(name: str): return f"{name} (ragas)" @@ -77,14 +74,14 @@ def measure(self, test_case: Dict): print("OPENAI_API_KEY is provided, ragas initializes the model by OpenAI.") self.model = None if 
isinstance(self.model, str): - print("Loading a HuggingFace Endpoint") - chat_model = HuggingFaceEndpoint( + print("LLM endpoint: ", self.model) + self.chat_model = HuggingFaceEndpoint( endpoint_url=self.model, timeout=600, ) else: print("Accepting user-initialized model as we could not detect OpenAI key or HuggingFace Endpoint URL.") - chat_model = self.model + self.chat_model = self.model # Create a dataset from the test case # Convert the Dict to a format compatible with Dataset if self.metrics is not None: @@ -133,7 +130,7 @@ def measure(self, test_case: Dict): self.score = evaluate( dataset, metrics=self.metrics, - llm=chat_model, + llm=self.chat_model, embeddings=self.embeddings, ) return self.score From c49ea8406031dd59cc7a921a33343fc75690cc53 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Sep 2024 02:46:19 +0000 Subject: [PATCH 05/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: aasavari --- evals/metrics/ragas/ragas.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 2acd86d8..da093e2c 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -1,4 +1,3 @@ - #!/usr/bin/env python # -*- coding: utf-8 -*- # Copyright (C) 2024 Intel Corporation @@ -6,15 +5,19 @@ # import os from typing import Dict, Optional, Union -from langchain_huggingface import HuggingFaceEndpoint + from langchain_core.embeddings import Embeddings from langchain_core.language_models import BaseLanguageModel +from langchain_huggingface import HuggingFaceEndpoint + def format_ragas_metric_name(name: str): return f"{name} (ragas)" + class RagasMetric: """This metric checks if the output is more than 3 letters.""" + def __init__( self, threshold: float = 0.3, @@ -36,13 +39,15 @@ def __init__( "context_utilization", # "reference_free_rubrics_score", ] + async def a_measure(self, test_case: Dict): return self.measure(test_case) + def measure(self, test_case: Dict): # sends to server try: from ragas import evaluate - from ragas.metrics import ( + from ragas.metrics import ( # reference_free_rubrics_score, answer_correctness, answer_relevancy, answer_similarity, @@ -50,7 +55,6 @@ def measure(self, test_case: Dict): context_recall, context_utilization, faithfulness, - # reference_free_rubrics_score, ) except ModuleNotFoundError: raise ModuleNotFoundError("Please install ragas to use this metric. 
`pip install ragas`.") @@ -117,13 +121,13 @@ def measure(self, test_case: Dict): for column in list(metric._required_columns.values())[0]: _required_columns.add(column) column2field = { - "user_input" : "question", - "response" : "answer", - "reference" : "ground_truth", - "retrieved_contexts" : "contexts" + "user_input": "question", + "response": "answer", + "reference": "ground_truth", + "retrieved_contexts": "contexts", } _required_fields = [column2field[column] for column in _required_columns] - data = {field : test_case[field] for field in _required_fields} + data = {field: test_case[field] for field in _required_fields} dataset = Dataset.from_dict(data) # evaluate @@ -134,8 +138,10 @@ def measure(self, test_case: Dict): embeddings=self.embeddings, ) return self.score + def is_successful(self): return self.success + @property def __name__(self): return "RAGAS" From 50d41670266883343b5e3ca56b461f6077b67598 Mon Sep 17 00:00:00 2001 From: ZePan110 Date: Fri, 20 Sep 2024 10:51:32 +0800 Subject: [PATCH 06/32] Add hyperlinks and paths validation. (#132) Signed-off-by: ZePan110 Signed-off-by: aasavari --- .github/workflows/pr-path-detection.yml | 123 ++++++++++++++++++ README.md | 2 +- doc/platform-optimization/README.md | 2 +- evals/evaluation/autorag/evaluation/README.md | 4 +- evals/evaluation/rag_eval/README.md | 2 +- examples/AudioQnA/README.md | 48 +++++++ 6 files changed, 176 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/pr-path-detection.yml create mode 100644 examples/AudioQnA/README.md diff --git a/.github/workflows/pr-path-detection.yml b/.github/workflows/pr-path-detection.yml new file mode 100644 index 00000000..2bfb3969 --- /dev/null +++ b/.github/workflows/pr-path-detection.yml @@ -0,0 +1,123 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +name: Check Paths and Hyperlinks + +on: + pull_request: + branches: [main] + types: [opened, reopened, ready_for_review, synchronize] + +jobs: + check-the-validity-of-hyperlinks-in-README: + runs-on: ubuntu-latest + steps: + - name: Clean Up Working Directory + run: sudo rm -rf ${{github.workspace}}/* + + - name: Checkout Repo GenAIEval + uses: actions/checkout@v4 + + - name: Check the Validity of Hyperlinks + run: | + cd ${{github.workspace}} + fail="FALSE" + url_lines=$(grep -Eo '\]\(http[s]?://[^)]+\)' --include='*.md' -r .|grep -Ev 'GenAIEval/blob/main') + if [ -n "$url_lines" ]; then + for url_line in $url_lines; do + url=$(echo "$url_line"|cut -d '(' -f2 | cut -d ')' -f1|sed 's/\.git$//') + path=$(echo "$url_line"|cut -d':' -f1 | cut -d'/' -f2-) + response=$(curl -L -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response" -ne 200 ]; then + echo "**********Validation failed, try again**********" + response_retry=$(curl -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response_retry" -eq 200 ]; then + echo "*****Retry successfully*****" + else + echo "Invalid link from ${{github.workspace}}/$path: $url" + fail="TRUE" + fi + fi + done + fi + + if [[ "$fail" == "TRUE" ]]; then + exit 1 + else + echo "All hyperlinks are valid." 
+ fi + shell: bash + + check-the-validity-of-relative-path: + runs-on: ubuntu-latest + steps: + - name: Clean up Working Directory + run: sudo rm -rf ${{github.workspace}}/* + + - name: Checkout Repo GenAIEval + uses: actions/checkout@v4 + + - name: Checking Relative Path Validity + run: | + cd ${{github.workspace}} + fail="FALSE" + repo_name=${{ github.event.pull_request.head.repo.full_name }} + if [ "$(echo "$repo_name"|cut -d'/' -f1)" != "opea-project" ]; then + owner=$(echo "${{ github.event.pull_request.head.repo.full_name }}" |cut -d'/' -f1) + branch="https://github.com/$owner/GenAIEval/tree/${{ github.event.pull_request.head.ref }}" + else + branch="https://github.com/opea-project/GenAIEval/blob/${{ github.event.pull_request.head.ref }}" + fi + link_head="https://github.com/opea-project/GenAIEval/blob/main" + png_lines=$(grep -Eo '\]\([^)]+\)' --include='*.md' -r .|grep -Ev 'http') + if [ -n "$png_lines" ]; then + for png_line in $png_lines; do + refer_path=$(echo "$png_line"|cut -d':' -f1 | cut -d'/' -f2-) + png_path=$(echo "$png_line"|cut -d '(' -f2 | cut -d ')' -f1) + if [[ "${png_path:0:1}" == "/" ]]; then + check_path=${{github.workspace}}$png_path + elif [[ "${png_path:0:1}" == "#" ]]; then + check_path=${{github.workspace}}/$refer_path$png_path + else + check_path=${{github.workspace}}/$(dirname "$refer_path")/$png_path + fi + real_path=$(realpath $check_path) + if [ $? -ne 0 ]; then + echo "Path $png_path in file ${{github.workspace}}/$refer_path does not exist" + fail="TRUE" + else + url=$link_head$(echo "$real_path" | sed 's|.*/GenAIEval||') + response=$(curl -I -L -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response" -ne 200 ]; then + echo "**********Validation failed, try again**********" + response_retry=$(curl -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response_retry" -eq 200 ]; then + echo "*****Retry successfully*****" + else + echo "Retry failed. Check branch ${{ github.event.pull_request.head.ref }}" + url_dev=$branch$(echo "$real_path" | sed 's|.*/GenAIEval||') + response=$(curl -I -L -s -o /dev/null -w "%{http_code}" "$url_dev") + if [ "$response" -ne 200 ]; then + echo "**********Validation failed, try again**********" + response_retry=$(curl -s -o /dev/null -w "%{http_code}" "$url_dev") + if [ "$response_retry" -eq 200 ]; then + echo "*****Retry successfully*****" + else + echo "Invalid path from ${{github.workspace}}/$refer_path: $png_path" + fail="TRUE" + fi + else + echo "Check branch ${{ github.event.pull_request.head.ref }} successfully." + fi + fi + fi + fi + done + fi + + if [[ "$fail" == "TRUE" ]]; then + exit 1 + else + echo "All hyperlinks are valid." + fi + shell: bash diff --git a/README.md b/README.md index 8734f83a..3d6b6d6e 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ results = evaluate(args) #### remote service usage -1. setup a separate server with [GenAIComps](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/lm-eval) +1. setup a separate server with [GenAIComps](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/utils/lm-eval) ``` # build cpu docker diff --git a/doc/platform-optimization/README.md b/doc/platform-optimization/README.md index ae74765d..8b98a21c 100644 --- a/doc/platform-optimization/README.md +++ b/doc/platform-optimization/README.md @@ -98,7 +98,7 @@ Let us consider isolating AI inference and reranking containers in application's Gaudi accelerated pipeline. 
In the -[manifest](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/kubernetes/manifests/gaudi/chatqna.yaml) +[manifest](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna.yaml) there are "tgi", "tei" and "teirerank" containers in "chatqna-tgi" and "chatqna-tei" and "chatqna-teirerank" deployments that will need a lot of CPUs. They implement text-generation-interface and diff --git a/evals/evaluation/autorag/evaluation/README.md b/evals/evaluation/autorag/evaluation/README.md index 8068d58b..99a623d1 100644 --- a/evals/evaluation/autorag/evaluation/README.md +++ b/evals/evaluation/autorag/evaluation/README.md @@ -1,6 +1,6 @@ # AutoRAG to evaluate the RAG system performance -AutoRAG is help to end-to-end evaluate the performance of the whole system. Currently, we support to evaluate the performance from 4 perspectives, answer_relevancy, faithfulness, context_recall, context_precision. Before using this service, the use should firstly prepare the groundtruth dataset in the [standard format](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/ground_truth.jsonl). We also provide a [script](https://github.com/opea-project/GenAIEval/blob/main/evals/evaluation/autorag/data_generation/gen_eval_dataset.py) to automatically generate the groundtruth query and answer. +AutoRAG is help to end-to-end evaluate the performance of the whole system. Currently, we support to evaluate the performance from 4 perspectives, answer_relevancy, faithfulness, context_recall, context_precision. Before using this service, the use should firstly prepare the groundtruth dataset in the [standard format](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/ragas/ground_truth.jsonl). We also provide a [script](https://github.com/opea-project/GenAIEval/blob/main/evals/evaluation/autorag/data_generation/gen_eval_dataset.py) to automatically generate the groundtruth query and answer. ## Service preparation The evaluation for the RAG system is based on the set up of the RAG services. Please follow [the steps](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/README.md) to set up your RAG services. @@ -12,7 +12,7 @@ At this moment, we provide a solution that test the single group of parameters a python -u ragas_evaluation_benchmark.py --ground_truth_file ground_truth.jsonl --search_type mmr --k 1 --fetch_k 5 --score_threshold 0.3 --top_n 1 --temperature 0.01 --top_k 5 --top_p 0.95 --repetition_penalty 1.1 --use_openai_key True ``` -For evaluating multiple groups of parameters, please use [this script](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/run_rag_benchmark.py). +For evaluating multiple groups of parameters, please use [this script](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/ragas/run_rag_benchmark.py). 
```bash python -u run_rag_benchmark.py --config config.yaml ``` diff --git a/evals/evaluation/rag_eval/README.md b/evals/evaluation/rag_eval/README.md index 59f7dd2f..1186464a 100644 --- a/evals/evaluation/rag_eval/README.md +++ b/evals/evaluation/rag_eval/README.md @@ -7,7 +7,7 @@ - [Prerequisites](#prerequisites) - [MultiHop (English dataset)](#multihop) - [Launch Service of RAG System](#launch-service-of-rag-system) - - [Launch Service of LLM-as-a-Judge](launch-service-of-llm) + - [Launch Service of LLM-as-a-Judge](#launch-service-of-llm-as-a-judge) - [Prepare Dataset](#prepare-dataset) - [Evaluation](#evaluation) - [CRUD (Chinese dataset)](#crud) diff --git a/examples/AudioQnA/README.md b/examples/AudioQnA/README.md new file mode 100644 index 00000000..45290620 --- /dev/null +++ b/examples/AudioQnA/README.md @@ -0,0 +1,48 @@ +# AudioQnA accuracy Evaluation + +## Dataset + + +We evaluate the ASR accuracy on the test set of librispeech [dataset](https://huggingface.co/datasets/andreagasparini/librispeech_test_only), which contains 2620 records of audio and texts. + +## Metrics + +We evaluate the WER (Word Error Rate) metric of the ASR microservice. + +## Evaluation + +### Launch ASR microservice + +Launch the ASR microserice with the following commands. For more details please refer to [doc](https://github.com/opea-project/GenAIComps/tree/main/comps/asr). + +```bash +git clone https://github.com/opea-project/GenAIComps +cd GenAIComps +docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile . +# change the name of model by editing model_name_or_path you want to evaluate +docker run -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest --model_name_or_path "openai/whisper-tiny" +``` + +### Evaluate + +Install dependencies: + +``` +pip install -r requirements.txt +``` + +Evaluate the performance with the LLM: +```py +# validate the offline model +# python offline_evaluate.py +# validate the online asr microservice accuracy +python online_evaluate.py +``` + +### Performance Result +Here is the tested result for your reference +|| WER | +| --- | ---- | +|whisper-large-v2| 2.87| +|whisper-large| 2.7 | +|whisper-medium| 3.45 | From bafc7013ee6fec19fdc1457b32b085c76cc08cdb Mon Sep 17 00:00:00 2001 From: aasavari Date: Fri, 20 Sep 2024 04:48:25 +0000 Subject: [PATCH 07/32] adding README for OPEA ragas Signed-off-by: aasavari --- evals/metrics/ragas/README.md | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 evals/metrics/ragas/README.md diff --git a/evals/metrics/ragas/README.md b/evals/metrics/ragas/README.md new file mode 100644 index 00000000..0736b265 --- /dev/null +++ b/evals/metrics/ragas/README.md @@ -0,0 +1,40 @@ +# OPEA adaption of ragas (LLM-as-a-judge evaluation of Retrieval Augmented Generation) +OPEA's adaption of [ragas](https://github.com/explodinggradients/ragas) allows you to use [ragas](https://github.com/explodinggradients/ragas) on Intel's Gaudi AI accelarator chips. + +## User data +Please wrap your input data in `datasets.Dataset` class. 
+``` +from datasets import Dataset +example = { + "question" : "Who is wife of Barak Obama", + "contexts" : [ + "Michelle Obama, wife of Barak Obama (former President of the United States of America) is an attorney", + "Barak and Michelle Obama have 2 daughters - Malia and Sasha" + ], + "answer" : "Michelle Obama", + "ground_truth" : "Wife of Barak Obama is Michelle Obama" +} +dataset = Dataset.from_list([example]) +``` + +## Launch HuggingFace endpoint on Intel's Gaudi machines +Please follow instructions mentioned in [TGI Gaudi repo](https://github.com/huggingface/tgi-gaudi) with your desired LLM such as `meta-llama/Meta-Llama-3.1-70B-Instruct`. + +## Run OPEA ragas pipeline using your desired list of metrics +``` +# note - if you wish to use answer relevancy metric, please set the embedding parameter +from langchain_community.embeddings import HuggingFaceBgeEmbeddings +embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5") + +from ragas import RagasMetric +ragas_metric = RagasMetric( + threshold=0.5, + model="", + embeddings=embeddings + ) +print(ragas_metric.measure(dataset)) +``` +That's it! + +## Troubleshooting +Please allow few minutes for HuggingFace endpoint to download model weights and load them. Larger models may take few more minutes. For any other issue, please file an issue and we will get back to you. \ No newline at end of file From 1cc5ffe673f263b3d08585830bfc98ac7c8d3c5f Mon Sep 17 00:00:00 2001 From: aasavari Date: Fri, 20 Sep 2024 04:53:01 +0000 Subject: [PATCH 08/32] adding python3 syntax to README Signed-off-by: aasavari --- evals/metrics/ragas/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/metrics/ragas/README.md b/evals/metrics/ragas/README.md index 0736b265..c5ecd176 100644 --- a/evals/metrics/ragas/README.md +++ b/evals/metrics/ragas/README.md @@ -3,7 +3,7 @@ OPEA's adaption of [ragas](https://github.com/explodinggradients/ragas) allows y ## User data Please wrap your input data in `datasets.Dataset` class. -``` +```python3 from datasets import Dataset example = { "question" : "Who is wife of Barak Obama", @@ -21,7 +21,7 @@ dataset = Dataset.from_list([example]) Please follow instructions mentioned in [TGI Gaudi repo](https://github.com/huggingface/tgi-gaudi) with your desired LLM such as `meta-llama/Meta-Llama-3.1-70B-Instruct`. 
## Run OPEA ragas pipeline using your desired list of metrics -``` +```python3 # note - if you wish to use answer relevancy metric, please set the embedding parameter from langchain_community.embeddings import HuggingFaceBgeEmbeddings embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5") From 3390b5d6b9cdbff133c8af6daf39d27e3e38e388 Mon Sep 17 00:00:00 2001 From: aasavari Date: Tue, 24 Sep 2024 02:44:07 +0000 Subject: [PATCH 09/32] adding auto (annotation-free) evaluation - functionality Signed-off-by: aasavari --- evals/evaluation/auto_eval/.env | 1 + evals/evaluation/auto_eval/README.md | 27 ++++ .../correctness_prompt.md | 6 + .../factualness_prompt.md | 6 + .../readability_prompt.md | 6 + .../relevance_prompt.md | 6 + .../auto_eval_metrics/opening_prompt.md | 13 ++ evals/evaluation/auto_eval/dataset.py | 88 +++++++++++ .../auto_eval/prompt_engineering.py | 89 +++++++++++ evals/evaluation/auto_eval/run_eval.py | 146 ++++++++++++++++++ evals/evaluation/auto_eval/utils/__init__.py | 4 + evals/evaluation/auto_eval/utils/helper.py | 82 ++++++++++ evals/evaluation/auto_eval/utils/model.py | 70 +++++++++ evals/evaluation/auto_eval/utils/retry.py | 47 ++++++ 14 files changed, 591 insertions(+) create mode 100644 evals/evaluation/auto_eval/.env create mode 100644 evals/evaluation/auto_eval/README.md create mode 100644 evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md create mode 100644 evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md create mode 100644 evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md create mode 100644 evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md create mode 100644 evals/evaluation/auto_eval/auto_eval_metrics/opening_prompt.md create mode 100644 evals/evaluation/auto_eval/dataset.py create mode 100644 evals/evaluation/auto_eval/prompt_engineering.py create mode 100644 evals/evaluation/auto_eval/run_eval.py create mode 100644 evals/evaluation/auto_eval/utils/__init__.py create mode 100644 evals/evaluation/auto_eval/utils/helper.py create mode 100644 evals/evaluation/auto_eval/utils/model.py create mode 100644 evals/evaluation/auto_eval/utils/retry.py diff --git a/evals/evaluation/auto_eval/.env b/evals/evaluation/auto_eval/.env new file mode 100644 index 00000000..780dfc14 --- /dev/null +++ b/evals/evaluation/auto_eval/.env @@ -0,0 +1 @@ +OPENAI_KEY=xxx \ No newline at end of file diff --git a/evals/evaluation/auto_eval/README.md b/evals/evaluation/auto_eval/README.md new file mode 100644 index 00000000..64080f2e --- /dev/null +++ b/evals/evaluation/auto_eval/README.md @@ -0,0 +1,27 @@ +# Auto (annotation-free) Evaluation of Retrieval Augmented Generation + +We provide easy-to-use, flexible and annotation-free RAG evaluation tool using LLM-as-a-judge while benefitting from Intel's Gaudi2 AI accelator chips. + +## Overview +### Data +AutoEval is best suited for Long Form Question Answering (LFQA) datasets where you want to gauge quality and factualness of the answer via LLM's intelligence. Here, you can use benchmarking datasets or bring your own custom datasets. Please make sure to set `field_map` to map AutoEval fields such as "question" to your dataset's corresponding field like "query". +> Note : To use benchmarking datasets, set argument `data_mode=benchmarking`. Similarly, to use custom datasets, set `data_mode=local`. +### Model +AutoEval can run in 3 evaluation modes - +1. 
`evaluation_mode=endpoint` uses HuggingFace endpoint. +- We recommend launching a HuggingFace endpoint on Gaudi AI accelerator machines to ensure maximum usage and performance. +- To launch HF endpoint on Gaudi2, please follow the 2-step instructions here - [tgi-gaudi](https://github.com/huggingface/tgi-gaudi). +- Pass your endpoint url as `model_name` argument. +2. `evaluation_mode=openai` uses openai backend. +- Please set your `OPEN_API_KEY` and your choice of model as `model_name` argument. +3. `evaluation_mode=local` uses your local hardware. +- Set `hf_token` argument and set your favourite open-source model in `model_name` argument. +- GPU usage will be prioritized after checking it's availability. Otherwise the model will run on CPU. +## Metrics +AutoEval provides 4 metrics - factualness, correctness, relevance and readability. You can also add your own metrics and your own grading scales. Don't forget to add your metric to `evaluation_metrics` argument. +## Generation configuration +Please set generation parameters as per your requirement in `GENERATION_CONFIG` in `run_eval.py`. +## Run +```bash +python3 run_eval.py --log_path="./exp1.log" +``` \ No newline at end of file diff --git a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md new file mode 100644 index 00000000..55f1db85 --- /dev/null +++ b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md @@ -0,0 +1,6 @@ +- Correctness: correctness measures how accurately and comprehensively does the answer resolve problem posed in the question. + - Score 1: If the answer is empty string or something like "I do not know the answer", the correctness score is 1. + - Score 2: If the answer only addresses a small part of the question correctly or it is missing many critical steps/aspects of the answer or the answer is too short to fully answer the question or is missing many steps causing the answer to not fully address the problem described in the question, then the correctness score is 2. + - Score 3: The answer mostly addresses the question but one critical aspect/step is missing or is incorrect. + - Score 4: the answer mostly answer the question and covers all critical/main aspects of the question, but itโ€™s missing important/neccessary details about one or more aspects. + - Score 5: the answer correctly and completely addresses the query. It also covers important details about each step. \ No newline at end of file diff --git a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md new file mode 100644 index 00000000..0289723a --- /dev/null +++ b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md @@ -0,0 +1,6 @@ +- Factualness: Factualness assesses how much of the provided answer is contained within the provided context. A higher score indicates that a higher proportion of claims present in the answer are present or can be derived from the provided context. + - Score 1: the answer is completely hallucinated i.e. not contained in the context at all or there is no answer. + - Score 2: only a small part of the answer is contained in the context but most of it is imaginary/hallucinated or the meaning is completely changed from what is represented in the context. 
+ - Score 3: Only about half of the answer is contained in the context. Rest of the answer is hallucinated or imaginary. + - Score 4: Most of the claims in the answer can be inferred from the provided context with very little information that is not directly supported by the provided context. + - Score 5: All of the claims in the answer are directly supported by the provided context, demonstrating high faithfulness to the provided context. \ No newline at end of file diff --git a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md new file mode 100644 index 00000000..1ddd1345 --- /dev/null +++ b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md @@ -0,0 +1,6 @@ +- Readability: Readability measures clarity and lucidity of the answer. Readability is measured solely based on the answer and it does not consider the question or the context. + - Score 1: the answer is empty or "I do not know the answer" or completely unreadable or No meaningful information can be extracted from the answer, then the score is 1. + - Score 2: the answer is slightly readable, there are irrelevant symbols or HTML tags or repeated words, but it can roughly form a meaningful sentence that can cover some aspects of the answer. + - Score 3: Answer can be read but there are grammatical mistakes in the answer. + - Score 4: the answer readable, but the readability and style can improved to better appeal to the reader. + - Score 5: the answer is reader friendly and well written. \ No newline at end of file diff --git a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md new file mode 100644 index 00000000..83f8e248 --- /dev/null +++ b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md @@ -0,0 +1,6 @@ +- Relevance: Relevance measures how well the answer relates to the question. + - Score 1: The answer doesn't mention anything about the question or is completely irrelevant to the question. + - Score 2: The answer only identifies the domain (e.g. cnvrg) mentioned in the question and provides information from the correct domain. But, the answer does not address the question itself and the point of the question is completely missed by it. + - Score 3: The answer correctly identifies the domain and essence of the question but the details in the answer are not relevant to the focus of the question. + - Score 4: The answer correctly identifies domain mentioned the question and essence of the qustion as well as stays consistent with both of them. But there is some part of the answer that is not relevant to the question or it's topic or it's essence. This irrelvant part is damaging the overall relevance of the answer. + - Score 5: The answer is completely relevant to the question and the details do not deviate from the essence of the question. There are no parts of the answer that are irrelevant or unneccessary for the given question. 
\ No newline at end of file diff --git a/evals/evaluation/auto_eval/auto_eval_metrics/opening_prompt.md b/evals/evaluation/auto_eval/auto_eval_metrics/opening_prompt.md new file mode 100644 index 00000000..bc938aa8 --- /dev/null +++ b/evals/evaluation/auto_eval/auto_eval_metrics/opening_prompt.md @@ -0,0 +1,13 @@ +Consider yourself as an engineer working at cnvrg.io which is a Full Stack Machine Learning Operating System owned by Intel. + +Your task: +You will be given an input consisting of a question, an answer and a context. Your task is to act as an impartial judge and provide a numerical score between 1 to 5 for each of the following metrics for the given answer. + +Important rules for you while completing this task: +1. You MUST ALWAYS provide a score for every metric mentioned below. +2. Make sure to understand definition of every metric fully before completing your task. Every metric is provided with grading scale and rubric. You MUST use this grading scale and rubric to determine your score. +3. Ensure that your scores and reasoning for every metric is indepedent of each other e.g., score for factualness should not impact score for corectness and vice versa. +4. Base your grading decision only on the given inputs and do not speculate or hallucinate. +5. You must also provide reasoning for your score in a single sentence. + +Your metric definitions along with grading scale and rubric: \ No newline at end of file diff --git a/evals/evaluation/auto_eval/dataset.py b/evals/evaluation/auto_eval/dataset.py new file mode 100644 index 00000000..6dc99e42 --- /dev/null +++ b/evals/evaluation/auto_eval/dataset.py @@ -0,0 +1,88 @@ +from datasets import Dataset, load_dataset +import jsonlines +import os + +class RAGDataset: + + """ + Dataset class to store data in HF datasets API format + """ + + def __init__(self, dataset, field_map, mode): + self.dataset = dataset + self.field_map = field_map + assert mode in ["local", "benchmarking"], "mode can be either local or benchmarking" + self.mode = mode + self.data = self.load_data() + self.validate_dataset() + + def load_data(self): + if self.mode == "local": + assert os.path.exists(self.dataset), "There is no such file - {}".format(self.dataset) + with jsonlines.open(self.dataset) as reader: + data = [] + for obj in reader: + ex = {} + for out_field, in_field in self.field_map.items(): + if type(obj[in_field]) == list: + ex[out_field] = '\n'.join(obj[in_field]) + else: + ex[out_field] = obj[in_field] + data.append(ex) + return Dataset.from_list(data) + else: + data = [] + for obj in load_dataset(self.dataset)['train']: + ex = {} + for out_field, in_field in self.field_map.items(): + if type(obj[in_field]) == list: + ex[out_field] = '\n'.join(obj[in_field]) + else: + ex[out_field] = obj[in_field] + data.append(ex) + return Dataset.from_list(data) + + def validate_dataset(self): + for i, example in enumerate(self.data): + for out_field in self.field_map: + assert out_field in example, "Example {} does not have {} field".format(i + 1, out_field) + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return len(self.data) + + def __iter__(self): + return iter(self.data) + +if __name__ == "__main__": + + dataset_path = '../../benchmark/ragas/ground_truth.jsonl' + field_map = { + 'question' : 'question', + 'ground_truth' : 'ground_truth', + 'context' : 'context', + } + + ds = RAGDataset(dataset=dataset_path, + field_map=field_map, + mode="local") + + for i, ex in enumerate(ds): + assert ex['question'] == ds[i]['question'], 
"index {} does not have correct query".format(i) + + dataset = "explodinggradients/ragas-wikiqa" + field_map = { + 'question' : 'question', + 'answer' : 'generated_with_rag', + 'context' : 'context', + 'ground_truth' : 'correct_answer' + } + ds = RAGDataset(dataset=dataset, field_map=field_map, mode="benchmarking") + + for i, ex in enumerate(ds): + assert ex['question'] == ds[i]['question'], "index {} does not have correct query".format(i) + + + diff --git a/evals/evaluation/auto_eval/prompt_engineering.py b/evals/evaluation/auto_eval/prompt_engineering.py new file mode 100644 index 00000000..8e2987c2 --- /dev/null +++ b/evals/evaluation/auto_eval/prompt_engineering.py @@ -0,0 +1,89 @@ +from jinja2 import Environment, FileSystemLoader, Template +import os +from dotenv import load_dotenv + +class Prompt: + + """class to customize prompt template using user-defined list of metrics""" + + def __init__(self, metrics, input_fields, prompt_dir): + self.metrics = metrics + self.input_fields = input_fields + self.define_template_paths(prompt_dir) + self.template = self.load_prompt_template() + + def define_template_paths(self, prompt_dir): + self.opening_prompt_path = os.path.join(prompt_dir, "opening_prompt.md") + metric_prompt_names = ["{}_prompt.md".format(metric) for metric in self.metrics] + local_metric_prompt_paths = [os.path.join("metric_prompt_templates", m) for m in metric_prompt_names] + self.metric_prompt_paths = [os.path.join(prompt_dir, p) for p in local_metric_prompt_paths] + + def create_grading_format(self): + grading_format = "You must ALWAYS provide every single one of the scores and reasonings in the following JSON format:" + grading_format += "\n" + "{" + "\n" + content = [] + reasoning_prompt = "Reasoning for {}: [your one line step by step reasoning about the {} of the answer]" + scoring_prompt = "Score for {}: [your score number for the {} of the answer]" + for metric in self.metrics: + reasoning = reasoning_prompt.format(metric, metric) + score = scoring_prompt.format(metric, metric) + content += reasoning + "\n" + score, + grading_format += "\n\n".join(content) + grading_format += "\n" + "}" + return grading_format + + def create_closing_prompt(self): + closing_prompt = ["Let's begin!"] + for f in self.input_fields: + closing_prompt += "Provided {}:".format(f) + '\n' + "{{" + f + "}}", + return '\n\n'.join(closing_prompt) + + @staticmethod + def load_template(template_path): + dir = os.path.dirname(os.path.abspath(__file__)) + env = Environment(loader=FileSystemLoader(dir)) + return env.get_template(template_path) + + def load_prompt_template(self): + content = [self.load_template(self.opening_prompt_path).render()] + for path in self.metric_prompt_paths: + content += self.load_template(path).render(), + content += self.create_grading_format(), + content += self.create_closing_prompt(), + return Template('\n\n'.join(content)) + + def render_prompt(self, **kwargs) -> str: + text = self.template.render(**kwargs) + return text + + +if __name__ == "__main__": + + """Here, we test implementation of Prompt class""" + + # step 0 - user input + metrics = ['factualness', 'relevance', 'correctness', 'readability'] + input_fields = ['question', 'answer', 'context'] + prompt_dir = './auto_eval_metrics/' + + # step 1 - load jinja2 environment + load_dotenv(os.path.join(os.path.dirname(__file__), ".env"), override=True) + + # step 2 - load prompt using Prompt class + prompt = Prompt(metrics=metrics, input_fields=input_fields, prompt_dir=prompt_dir) + + example = { + "question": "Who is 
wife of Barak Obama", + "context": "Michelle Obama, wife of Barak Obama (former President of the United States of America) is an attorney. Barak and Michelle Obama have 2 daughters - Malia and Sasha", + "answer": "Michelle Obama", + "ground_truth": "Wife of Barak Obama is Michelle Obama", + } + + rendered_prompt = prompt.render_prompt( + question=example['question'], + answer=example['answer'], + context=example['context']) + + print(rendered_prompt) + + diff --git a/evals/evaluation/auto_eval/run_eval.py b/evals/evaluation/auto_eval/run_eval.py new file mode 100644 index 00000000..882846bc --- /dev/null +++ b/evals/evaluation/auto_eval/run_eval.py @@ -0,0 +1,146 @@ +import argparse +import ast +from dataset import RAGDataset +from dotenv import load_dotenv +from huggingface_hub import login +from jinja2 import Environment, FileSystemLoader +import json +import os +import pandas as pd +from prompt_engineering import Prompt +import time +from utils.model import * +from utils.helper import * + +GENERATION_CONFIG = { + "openai" : {"temperature" : 0.1}, + "endpoint" : {"max_new_tokens" : 500}, + "local" : {"max_new_tokens" : 500} +} + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_data", type=str, + default="explodinggradients/ragas-wikiqa", + help="path of the input data" + ) + parser.add_argument( + "--data_mode", type=str, + default="benchmarking", + help="mode of data can be local or benchmarking" + ) + parser.add_argument( + "--field_map", type=dict, + default={'question' : 'question','answer' : 'generated_with_rag','context' : 'context'}, + help="field map that will be used while loading the dataset" + ) + parser.add_argument( + "--template_dir", type=str, default="auto_eval_metrics", + help="path to dir of prompt templates" + ) + parser.add_argument( + "--hf_token", type=str, default="", + help="Please provide your HF token" + ) + parser.add_argument( + "--openai_key", type=str, + default="add your OpenAI token", + help="please provide your OpenAI key" + ) + parser.add_argument( + "--evaluation_mode", type=str, + default="openai", + help="evaluation mode can be openai / endpoint / local" + ) + parser.add_argument( + "--model_name", type=str, + default="gpt-4o", + help="the model to be used for evaluation" + ) + parser.add_argument( + "--evaluation_metrics", type=list, + default=["factualness", "relevance", "correctness", "readability"], + help="metrics to be used for evaluation of RAG" + ) + parser.add_argument( + "--log_path", type=str, default="./exp1.log", + help="path of the log file" + ) + args = parser.parse_args() + return args + + +def load_template(template_path): + template = Environment( + loader=FileSystemLoader(os.path.dirname(os.path.abspath(__file__))) + ).get_template(template_path) + return template + + +def generate(evaluator, data, template, generation_config, args): + responses = [] + for sample in data: + print(sample['question']) + prompt = render_prompt(template, query=sample['question'], answer=sample['answer'], context=sample['context']) + messages = [{"role": "user", "content": prompt}] + response = evaluator.generate(messages, **generation_config) + print(response) + responses.append(response) + print("-"*100) + break + return responses + + +def log_responses(responses, args): + sep = '\n' + '-'*100 + '\n' + text = sep.join(responses) + with open(args.log_path, 'w') as f: + f.write(text) + + +if __name__ == "__main__": + + very_start = time.time() + + # step 1 : load dot environment + dot_env_path = 
os.path.join(os.path.dirname(__file__), ".env") + print("Loading dot environment from {}".format(dot_env_path)) + load_dotenv(dot_env_path, override=True) + + # step 2 : validate and load input args + args = get_args() + + # step 3 : load dataset + data = RAGDataset(dataset=args.input_data, + field_map=args.field_map, + mode=args.data_mode) + + # step 4 : load LLM + if args.evaluation_mode == "openai": + # assert args.model_name in ALLOWED_OPENAI_MODELS, "please provide a openai model from the given list of allowed models" + print("Using {} openai key".format(args.openai_key)) + evaluator = OAIEvaluator(args.openai_key, args.model_name) + elif args.evaluation_mode == "endpoint": + print("Loading HF endpoint at {}".format(args.model_name)) + evaluator = EndpointEvaluator(args.model_name) + else: + assert args.evaluation_mode == "local", "evaluation mode must be openai / endpoint / local" + print("Loading {} model locally".format(args.model_name)) + login(token=args.hf_token) + evaluator = HFEvaluator(args.model_name) + + # step 5 : load prompt + prompt = Prompt(metrics=args.evaluation_metrics, input_fields=args.field_map, prompt_dir=args.template_dir) + prompt_template = prompt.template + + # step 6 : start scoring + generation_config = GENERATION_CONFIG[args.evaluation_mode] + tic = time.time() + responses = generate(evaluator, data, prompt_template, generation_config, args) + toc = time.time() + print("Generation time for {} examples = {:.2f} seconds".format(len(data), toc - tic)) + log_responses(args=args, responses=responses) + + print(f"this script took {time.time() - very_start}s.") + \ No newline at end of file diff --git a/evals/evaluation/auto_eval/utils/__init__.py b/evals/evaluation/auto_eval/utils/__init__.py new file mode 100644 index 00000000..7b2e1b9a --- /dev/null +++ b/evals/evaluation/auto_eval/utils/__init__.py @@ -0,0 +1,4 @@ +import sys +import os + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) \ No newline at end of file diff --git a/evals/evaluation/auto_eval/utils/helper.py b/evals/evaluation/auto_eval/utils/helper.py new file mode 100644 index 00000000..5095e299 --- /dev/null +++ b/evals/evaluation/auto_eval/utils/helper.py @@ -0,0 +1,82 @@ +import json +import yaml +import os +import re +import pandas as pd +import numpy as np + +from scipy.stats import pearsonr +from jinja2 import Template + +from sklearn.metrics import mean_squared_error + + +def load_jsonl(data_path): + result = [] + with open(data_path, 'r') as f: + for line in f: + data = json.loads(line) + result.append(data) + return result + + +def load_config(config_path): + + with open(config_path, "r") as file: + config = yaml.safe_load(file) + + return config + + +def compute_mse(x, y): + return mean_squared_error(x, y) + + +def compute_pearson(x, y): + corr, _ = pearsonr(x, y) + return corr + + +def extract_delay_from_rate_limit_error_msg(text): + import re + + pattern = r"retry after (\d+)" + match = re.search(pattern, text) + if match: + retry_time_from_message = match.group(1) + return float(retry_time_from_message) + else: + return 5 + +def render_prompt(template: Template, **kwargs) -> str: + text = template.render(**kwargs) + return text + + +def extract_score(pattern: str, text: str): + match = re.search(pattern, text.lower()) + + if match: + score = int(match.group(1)) + else: + score = 1 + + return score + +def compute_metric_wise_assessment(metrics, groundtruth, prediction): + fine_grained_evaluation = pd.DataFrame(index=metrics) + for i, metric in enumerate(metrics): + 
fine_grained_evaluation.loc[metric, 'MSE'] = compute_mse(groundtruth[i], prediction[i]) + abs_diff = [abs(g - p) for g, p in zip(groundtruth[i], prediction[i])] + for diff in [0, 1, 2]: + fine_grained_evaluation.loc[metric, "|label - score| <= {}".format(diff)] = sum(val <= diff for val in abs_diff) + return fine_grained_evaluation + +def compute_weighted_assessment(weights, groundtruth, prediction): + weights, groundtruth, prediction = np.array(weights), np.array(groundtruth), np.array(prediction) + weighted_labels = np.sum(weights[:, np.newaxis] * groundtruth, axis=0) + weighted_scores = np.sum(weights[:, np.newaxis] * prediction, axis=0) + mse = compute_mse(weighted_labels, weighted_scores) + pearson_correlation = compute_pearson(weighted_labels, weighted_scores) + return mse, pearson_correlation + diff --git a/evals/evaluation/auto_eval/utils/model.py b/evals/evaluation/auto_eval/utils/model.py new file mode 100644 index 00000000..d665c9c0 --- /dev/null +++ b/evals/evaluation/auto_eval/utils/model.py @@ -0,0 +1,70 @@ +from typing import List +import torch +import openai + +from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer +from transformers import pipeline +from transformers import AutoModel + +from .retry import retry_and_handle_exceptions +from .helper import extract_delay_from_rate_limit_error_msg + +from huggingface_hub import InferenceClient + + +class EndpointEvaluator: + def __init__(self, model_name): + client = InferenceClient(base_url="{}/v1/".format(model_name)) + + def generate(self, messages, **kwargs): + output = client.chat.completions.create( + model="tgi", + messages=messages, + stream=True, + **kwargs, + ) + for chunk in output: + print(chunk.choices[0].delta.content) + +class HFEvaluator: + def __init__(self, model_name): + self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + device_map = "auto" if torch.cuda.is_available() else "cpu" + if device_map == 'cpu': + self.pipe = pipeline("text-generation", model=model_name, tokenizer=self.tokenizer, + torch_dtype=torch.bfloat16, device_map="cpu") + else: + self.pipe = pipeline("text-generation", model=model_name, tokenizer=self.tokenizer, + torch_dtype=torch.float16, device_map="auto") + + def generate(self, messages, **kwargs) -> List[float]: + + prompt = self.pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + outputs = self.pipe(prompt, **kwargs, return_full_text=False) + result = outputs[0]["generated_text"] + return result + + + +class OAIEvaluator: + def __init__(self, openai_key, model_name): + openai.api_key = openai_key + self.model_name = model_name + + @retry_and_handle_exceptions( + exception_to_check=( + openai.RateLimitError, + openai.APIError, + KeyError, + ), + max_retries=5, + extract_delay_from_error_message=extract_delay_from_rate_limit_error_msg, + ) + def generate(self, messages: list, **kwargs) -> List[float]: + return openai.chat.completions.create( + model=self.model_name, + messages=messages, + **kwargs, + ).choices[0].message.content + + diff --git a/evals/evaluation/auto_eval/utils/retry.py b/evals/evaluation/auto_eval/utils/retry.py new file mode 100644 index 00000000..573544e7 --- /dev/null +++ b/evals/evaluation/auto_eval/utils/retry.py @@ -0,0 +1,47 @@ +from typing import Tuple, Union, Optional +import functools +import time +import random + + +def retry_and_handle_exceptions( + exception_to_check: Union[Exception, Tuple[Exception]], + max_retries: int = 3, + initial_delay: float = 1, + 
exponential_base: float = 2, + jitter: bool = False, + extract_delay_from_error_message: Optional[any] = None, +): + def deco_retry(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + delay = initial_delay + for i in range(max_retries): + try: + return func(*args, **kwargs) + except exception_to_check as e: + if i == max_retries - 1: + raise Exception( + "Func execution failed after {0} retries: {1}".format( + max_retries, e + ) + ) + delay *= exponential_base * (1 + jitter * random.random()) + delay_from_error_message = None + if extract_delay_from_error_message is not None: + delay_from_error_message = extract_delay_from_error_message( + str(e) + ) + final_delay = ( + delay_from_error_message if delay_from_error_message else delay + ) + print( + "Func execution failed. Retrying in {0} seconds: {1}".format( + final_delay, e + ) + ) + time.sleep(final_delay) + + return wrapper + + return deco_retry From f998a00fd4faed21e0e1b967c0c1f77a692966da Mon Sep 17 00:00:00 2001 From: aasavari Date: Tue, 24 Sep 2024 05:37:36 +0000 Subject: [PATCH 10/32] auto-eval endpoint on gaudi - tested and working Signed-off-by: aasavari --- evals/evaluation/auto_eval/run_eval.py | 8 ++++---- evals/evaluation/auto_eval/utils/model.py | 10 ++++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/evals/evaluation/auto_eval/run_eval.py b/evals/evaluation/auto_eval/run_eval.py index 882846bc..6a0c3f8f 100644 --- a/evals/evaluation/auto_eval/run_eval.py +++ b/evals/evaluation/auto_eval/run_eval.py @@ -14,7 +14,7 @@ GENERATION_CONFIG = { "openai" : {"temperature" : 0.1}, - "endpoint" : {"max_new_tokens" : 500}, + "endpoint" : {"max_tokens": 500}, "local" : {"max_new_tokens" : 500} } @@ -45,17 +45,17 @@ def get_args(): ) parser.add_argument( "--openai_key", type=str, - default="add your OpenAI token", + default="", help="please provide your OpenAI key" ) parser.add_argument( "--evaluation_mode", type=str, - default="openai", + default="endpoint", help="evaluation mode can be openai / endpoint / local" ) parser.add_argument( "--model_name", type=str, - default="gpt-4o", + default="http://localhost:8085", help="the model to be used for evaluation" ) parser.add_argument( diff --git a/evals/evaluation/auto_eval/utils/model.py b/evals/evaluation/auto_eval/utils/model.py index d665c9c0..5b113bb4 100644 --- a/evals/evaluation/auto_eval/utils/model.py +++ b/evals/evaluation/auto_eval/utils/model.py @@ -14,17 +14,19 @@ class EndpointEvaluator: def __init__(self, model_name): - client = InferenceClient(base_url="{}/v1/".format(model_name)) + self.client = InferenceClient(base_url="{}/v1/chat/completions".format(model_name)) def generate(self, messages, **kwargs): - output = client.chat.completions.create( + output = self.client.chat.completions.create( model="tgi", messages=messages, stream=True, **kwargs, ) - for chunk in output: - print(chunk.choices[0].delta.content) + response = [chunk.choices[0].delta.content for chunk in output] + response = [content for content in response if content] + response = ' '.join(response) + return response class HFEvaluator: def __init__(self, model_name): From 6bfadb2206d518dd329f2c43e9695f78a072c414 Mon Sep 17 00:00:00 2001 From: aasavari Date: Tue, 24 Sep 2024 23:18:52 +0000 Subject: [PATCH 11/32] updating testing environment Signed-off-by: aasavari --- tests/requirements.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/requirements.txt b/tests/requirements.txt index d2cd20b0..61a09ff4 100644 --- a/tests/requirements.txt +++ 
b/tests/requirements.txt @@ -1,6 +1,10 @@ bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@e5c2f31625223431d7987f43b70b75b9d26ba118 +datasets jieba +jsonlines langchain_community langchain_huggingface lm-eval==0.4.3 +openai +python-dotenv ragas From bd0d2af1344c9fa200d1b85df97053ed423d6ac7 Mon Sep 17 00:00:00 2001 From: aasavari Date: Wed, 25 Sep 2024 00:57:23 +0000 Subject: [PATCH 12/32] adding unit test for auto eval - passing successfully Signed-off-by: aasavari --- .../auto_eval/{dataset.py => rag_dataset.py} | 0 evals/evaluation/auto_eval/run_eval.py | 174 +++++++++++------- evals/metrics/ragas/ragas.py | 16 -- tests/test_auto_eval.py | 56 ++++++ 4 files changed, 168 insertions(+), 78 deletions(-) rename evals/evaluation/auto_eval/{dataset.py => rag_dataset.py} (100%) create mode 100644 tests/test_auto_eval.py diff --git a/evals/evaluation/auto_eval/dataset.py b/evals/evaluation/auto_eval/rag_dataset.py similarity index 100% rename from evals/evaluation/auto_eval/dataset.py rename to evals/evaluation/auto_eval/rag_dataset.py diff --git a/evals/evaluation/auto_eval/run_eval.py b/evals/evaluation/auto_eval/run_eval.py index 6a0c3f8f..4591b7d0 100644 --- a/evals/evaluation/auto_eval/run_eval.py +++ b/evals/evaluation/auto_eval/run_eval.py @@ -1,22 +1,18 @@ import argparse import ast -from dataset import RAGDataset +from evals.evaluation.auto_eval.rag_dataset import RAGDataset from dotenv import load_dotenv from huggingface_hub import login from jinja2 import Environment, FileSystemLoader import json import os import pandas as pd -from prompt_engineering import Prompt +from evals.evaluation.auto_eval.prompt_engineering import Prompt import time -from utils.model import * -from utils.helper import * +from evals.evaluation.auto_eval.utils.model import * +from evals.evaluation.auto_eval.utils.helper import * + -GENERATION_CONFIG = { - "openai" : {"temperature" : 0.1}, - "endpoint" : {"max_tokens": 500}, - "local" : {"max_new_tokens" : 500} -} def get_args(): parser = argparse.ArgumentParser() @@ -78,18 +74,7 @@ def load_template(template_path): return template -def generate(evaluator, data, template, generation_config, args): - responses = [] - for sample in data: - print(sample['question']) - prompt = render_prompt(template, query=sample['question'], answer=sample['answer'], context=sample['context']) - messages = [{"role": "user", "content": prompt}] - response = evaluator.generate(messages, **generation_config) - print(response) - responses.append(response) - print("-"*100) - break - return responses + def log_responses(responses, args): @@ -99,48 +84,113 @@ def log_responses(responses, args): f.write(text) -if __name__ == "__main__": - very_start = time.time() +class AutoEvaluate: - # step 1 : load dot environment - dot_env_path = os.path.join(os.path.dirname(__file__), ".env") - print("Loading dot environment from {}".format(dot_env_path)) - load_dotenv(dot_env_path, override=True) - # step 2 : validate and load input args - args = get_args() - - # step 3 : load dataset - data = RAGDataset(dataset=args.input_data, - field_map=args.field_map, - mode=args.data_mode) - - # step 4 : load LLM - if args.evaluation_mode == "openai": - # assert args.model_name in ALLOWED_OPENAI_MODELS, "please provide a openai model from the given list of allowed models" - print("Using {} openai key".format(args.openai_key)) - evaluator = OAIEvaluator(args.openai_key, args.model_name) - elif args.evaluation_mode == "endpoint": - print("Loading HF endpoint at 
{}".format(args.model_name)) - evaluator = EndpointEvaluator(args.model_name) - else: - assert args.evaluation_mode == "local", "evaluation mode must be openai / endpoint / local" - print("Loading {} model locally".format(args.model_name)) - login(token=args.hf_token) - evaluator = HFEvaluator(args.model_name) - - # step 5 : load prompt - prompt = Prompt(metrics=args.evaluation_metrics, input_fields=args.field_map, prompt_dir=args.template_dir) - prompt_template = prompt.template - - # step 6 : start scoring - generation_config = GENERATION_CONFIG[args.evaluation_mode] - tic = time.time() - responses = generate(evaluator, data, prompt_template, generation_config, args) - toc = time.time() - print("Generation time for {} examples = {:.2f} seconds".format(len(data), toc - tic)) - log_responses(args=args, responses=responses) - - print(f"this script took {time.time() - very_start}s.") + + def __init__(self, + dataset, + data_mode, + field_map, + template_dir, + evaluation_mode, + model_name, + evaluation_metrics, + hf_token=None, + openai_key=None, + debug_mode=None + ): + self.GENERATION_CONFIG = { + "openai" : {"temperature" : 0.1}, + "endpoint" : {"max_tokens": 500}, + "local" : {"max_new_tokens" : 500} + } + self.load_env() + self.data = RAGDataset(dataset=dataset, field_map=field_map, mode=data_mode) + self.evaluator = self.get_evaluator(evaluation_mode, + model_name, + openai_key, + hf_token) + self.prompt_template = self.get_template(evaluation_metrics, field_map, template_dir) + self.debug_mode = debug_mode + self.generation_config = self.GENERATION_CONFIG[evaluation_mode] + + def load_env(self,): + dot_env_path = os.path.join(os.path.dirname(__file__), ".env") + print("Loading dot environment from {}".format(dot_env_path)) + load_dotenv(dot_env_path, override=True) + + def get_evaluator(self, evaluation_mode, model_name, openai_key=None, hf_token=None): + if evaluation_mode == "openai": + # assert args.model_name in ALLOWED_OPENAI_MODELS, "please provide a openai model from the given list of allowed models" + print("Using {} openai key".format(openai_key)) + evaluator = OAIEvaluator(openai_key, model_name) + elif evaluation_mode == "endpoint": + print("Loading HF endpoint at {}".format(model_name)) + evaluator = EndpointEvaluator(model_name) + else: + assert args.evaluation_mode == "local", "evaluation mode must be openai / endpoint / local" + print("Loading {} model locally".format(model_name)) + login(token=hf_token) + evaluator = HFEvaluator(args.model_name) + return evaluator + + def get_template(self, evaluation_metrics, field_map, template_dir): + + return Prompt(metrics=evaluation_metrics, + input_fields=field_map, + prompt_dir=template_dir).template + + def measure(self): + n_samples = 1 if self.debug_mode else len(self.data) + responses = [''] * n_samples + start = time.time() + for i in range(n_samples): + prompt = render_prompt(self.prompt_template, + query=self.data[i]['question'], + answer=self.data[i]['answer'], + context=self.data[i]['context']) + messages = [{"role": "user", "content": prompt}] + response = self.evaluator.generate(messages, **self.generation_config) + responses[i] = response + end = time.time() + print("Generation of scores and reasoning took {:.2f} seconds for {:,} examples".format(end-start, n_samples)) + return responses + + +# if __name__ == "__main__": + +# dataset = "explodinggradients/ragas-wikiqa" +# data_mode = "benchmarking" +# field_map = { +# 'question' : 'question', +# 'answer' : 'generated_with_rag', +# 'context' : 'context' +# } + +# 
template_dir = "auto_eval_metrics" + +# evaluation_mode = "endpoint" +# model_name = "http://localhost:8085" + +# evaluation_metrics = ["factualness", +# "relevance", +# "correctness", +# "readability"] + +# evaluator = AutoEvaluate(dataset=dataset, +# data_mode=data_mode, +# field_map=field_map, +# template_dir=template_dir, +# evaluation_mode=evaluation_mode, +# model_name=model_name, +# evaluation_metrics=evaluation_metrics, +# debug_mode=True) + +# responses = evaluator.measure() + +# for response in responses: +# print(response) +# print("-"*100) \ No newline at end of file diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index e94c8008..9b0a1d3e 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -47,11 +47,7 @@ def measure(self, test_case: Dict): # sends to server try: from ragas import evaluate -<<<<<<< HEAD - from ragas.metrics import ( # reference_free_rubrics_score, -======= from ragas.metrics import ( ->>>>>>> upstream/main answer_correctness, answer_relevancy, answer_similarity, @@ -121,23 +117,12 @@ def measure(self, test_case: Dict): ] # Find necessary input fields using the given metrics _required_columns = set() -<<<<<<< HEAD - for metric in self.metrics: - for column in list(metric._required_columns.values())[0]: - _required_columns.add(column) - column2field = { -======= column_map = { # this column maps new naming style in ragas to their old naming style ->>>>>>> upstream/main "user_input": "question", "response": "answer", "reference": "ground_truth", "retrieved_contexts": "contexts", } -<<<<<<< HEAD - _required_fields = [column2field[column] for column in _required_columns] - data = {field: test_case[field] for field in _required_fields} -======= for metric in self.metrics: if hasattr(metric, "_required_columns"): for column in list(metric._required_columns.values())[0]: @@ -154,7 +139,6 @@ def measure(self, test_case: Dict): # get only necessary columns from test case data = {column: test_case[column] for column in _required_columns} ->>>>>>> upstream/main dataset = Dataset.from_dict(data) # evaluate diff --git a/tests/test_auto_eval.py b/tests/test_auto_eval.py new file mode 100644 index 00000000..354656c7 --- /dev/null +++ b/tests/test_auto_eval.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +import os +import unittest + +# import sys +# sys.path.insert(0, "~/GenAIEval/evals") + +from evals.evaluation.auto_eval.run_eval import AutoEvaluate + +host_ip = os.getenv("host_ip", "localhost") +port = os.getenv("port", "8085") + +class TestRagasMetric(unittest.TestCase): + + # @unittest.skip("need pass localhost id") + def test_ragas(self): + + dataset = "explodinggradients/ragas-wikiqa" + data_mode = "benchmarking" + field_map = { + 'question' : 'question', + 'answer' : 'generated_with_rag', + 'context' : 'context' + } + + template_dir = "auto_eval_metrics" + + evaluation_mode = "endpoint" + model_name = "http://localhost:8085" + + evaluation_metrics = ["factualness", + "relevance", + "correctness", + "readability"] + + evaluator = AutoEvaluate(dataset=dataset, + data_mode=data_mode, + field_map=field_map, + template_dir=template_dir, + evaluation_mode=evaluation_mode, + model_name=model_name, + evaluation_metrics=evaluation_metrics, + debug_mode=True) + + responses = evaluator.measure() + + for response in responses: + print(response) + +if __name__ == "__main__": + unittest.main() From 
85b1ff9b998e6148e56fa2cefee6ec7e5c7b5bc0 Mon Sep 17 00:00:00 2001 From: aasavari Date: Wed, 25 Sep 2024 01:02:33 +0000 Subject: [PATCH 13/32] editing parameters for online test environment Signed-off-by: aasavari --- tests/test_auto_eval.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/test_auto_eval.py b/tests/test_auto_eval.py index 354656c7..494edeb6 100644 --- a/tests/test_auto_eval.py +++ b/tests/test_auto_eval.py @@ -7,13 +7,10 @@ import os import unittest -# import sys -# sys.path.insert(0, "~/GenAIEval/evals") - from evals.evaluation.auto_eval.run_eval import AutoEvaluate host_ip = os.getenv("host_ip", "localhost") -port = os.getenv("port", "8085") +port = os.getenv("port", "8008") class TestRagasMetric(unittest.TestCase): @@ -31,7 +28,7 @@ def test_ragas(self): template_dir = "auto_eval_metrics" evaluation_mode = "endpoint" - model_name = "http://localhost:8085" + model_name = f"http://{host_ip}:{port}" evaluation_metrics = ["factualness", "relevance", From 225f1a78ba9702f518539f5455e0827c228259cb Mon Sep 17 00:00:00 2001 From: aasavari Date: Wed, 25 Sep 2024 01:15:11 +0000 Subject: [PATCH 14/32] added working example with endpoint to README Signed-off-by: aasavari --- evals/evaluation/auto_eval/README.md | 53 +++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/evals/evaluation/auto_eval/README.md b/evals/evaluation/auto_eval/README.md index 64080f2e..09067564 100644 --- a/evals/evaluation/auto_eval/README.md +++ b/evals/evaluation/auto_eval/README.md @@ -8,20 +8,55 @@ AutoEval is best suited for Long Form Question Answering (LFQA) datasets where y > Note : To use benchmarking datasets, set argument `data_mode=benchmarking`. Similarly, to use custom datasets, set `data_mode=local`. ### Model AutoEval can run in 3 evaluation modes - -1. `evaluation_mode=endpoint` uses HuggingFace endpoint. +1. `evaluation_mode="endpoint"` uses HuggingFace endpoint. - We recommend launching a HuggingFace endpoint on Gaudi AI accelerator machines to ensure maximum usage and performance. - To launch HF endpoint on Gaudi2, please follow the 2-step instructions here - [tgi-gaudi](https://github.com/huggingface/tgi-gaudi). - Pass your endpoint url as `model_name` argument. -2. `evaluation_mode=openai` uses openai backend. +2. `evaluation_mode="openai"` uses openai backend. - Please set your `OPEN_API_KEY` and your choice of model as `model_name` argument. -3. `evaluation_mode=local` uses your local hardware. +3. `evaluation_mode="local"` uses your local hardware. - Set `hf_token` argument and set your favourite open-source model in `model_name` argument. -- GPU usage will be prioritized after checking it's availability. Otherwise the model will run on CPU. +- GPU usage will be prioritized after checking it's availability. If GPU is unavailable, the model will run on CPU. ## Metrics -AutoEval provides 4 metrics - factualness, correctness, relevance and readability. You can also add your own metrics and your own grading scales. Don't forget to add your metric to `evaluation_metrics` argument. +AutoEval provides 4 metrics - factualness, correctness, relevance and readability. You can also bring your own metrics and grading scales. Don't forget to add your metric to `evaluation_metrics` argument. ## Generation configuration Please set generation parameters as per your requirement in `GENERATION_CONFIG` in `run_eval.py`. 
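As a point of reference for the paragraph above, `GENERATION_CONFIG` is just a small per-mode dictionary (its defaults appear in the `run_eval.py` changes earlier in this series); a minimal sketch of its shape is below — the values mirror those defaults and the comments note where each set of kwargs is forwarded, so treat it as an illustration rather than a spec:

```python3
# Sketch of the per-mode generation settings used by the evaluator.
# Values mirror the defaults added to run_eval.py in this patch series.
GENERATION_CONFIG = {
    "openai": {"temperature": 0.1},     # kwargs for openai.chat.completions.create
    "endpoint": {"max_tokens": 500},    # kwargs for the TGI chat-completions call
    "local": {"max_new_tokens": 500},   # kwargs for the local HF text-generation pipeline
}
```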
-## Run -```bash -python3 run_eval.py --log_path="./exp1.log" -``` \ No newline at end of file + +## Run using HF endpoint +```python3 +dataset = "explodinggradients/ragas-wikiqa" +data_mode = "benchmarking" +field_map = { + 'question' : 'question', + 'answer' : 'generated_with_rag', + 'context' : 'context' + } + +template_dir = "auto_eval_metrics" + +evaluation_mode = "endpoint" + +host_ip = os.getenv("host_ip", "localhost") +port = os.getenv("port", "") +model_name = f"http://{host_ip}:{port}" + +evaluation_metrics = ["factualness", + "relevance", + "correctness", + "readability"] + +evaluator = AutoEvaluate(dataset=dataset, + data_mode=data_mode, + field_map=field_map, + template_dir=template_dir, + evaluation_mode=evaluation_mode, + model_name=model_name, + evaluation_metrics=evaluation_metrics, + debug_mode=True) + +responses = evaluator.measure() + +for response in responses: + print(response) +``` +That's it! For troubleshooting, please submit an issue and we will get right on it. \ No newline at end of file From 3a67ff376ad5665ced66cd0764a3943aff06243f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 25 Sep 2024 01:23:34 +0000 Subject: [PATCH 15/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/evaluation/auto_eval/README.md | 31 ++-- .../correctness_prompt.md | 4 +- .../factualness_prompt.md | 2 +- .../readability_prompt.md | 2 +- .../relevance_prompt.md | 4 +- .../auto_eval_metrics/opening_prompt.md | 4 +- .../auto_eval/prompt_engineering.py | 59 +++---- evals/evaluation/auto_eval/rag_dataset.py | 56 +++---- evals/evaluation/auto_eval/run_eval.py | 158 ++++++++---------- evals/evaluation/auto_eval/utils/__init__.py | 5 +- evals/evaluation/auto_eval/utils/helper.py | 52 +++--- evals/evaluation/auto_eval/utils/model.py | 68 ++++---- evals/evaluation/auto_eval/utils/retry.py | 27 +-- tests/test_auto_eval.py | 31 ++-- 14 files changed, 244 insertions(+), 259 deletions(-) diff --git a/evals/evaluation/auto_eval/README.md b/evals/evaluation/auto_eval/README.md index 09067564..c260a17e 100644 --- a/evals/evaluation/auto_eval/README.md +++ b/evals/evaluation/auto_eval/README.md @@ -26,11 +26,7 @@ Please set generation parameters as per your requirement in `GENERATION_CONFIG` ```python3 dataset = "explodinggradients/ragas-wikiqa" data_mode = "benchmarking" -field_map = { - 'question' : 'question', - 'answer' : 'generated_with_rag', - 'context' : 'context' - } +field_map = {"question": "question", "answer": "generated_with_rag", "context": "context"} template_dir = "auto_eval_metrics" @@ -40,23 +36,22 @@ host_ip = os.getenv("host_ip", "localhost") port = os.getenv("port", "") model_name = f"http://{host_ip}:{port}" -evaluation_metrics = ["factualness", - "relevance", - "correctness", - "readability"] +evaluation_metrics = ["factualness", "relevance", "correctness", "readability"] -evaluator = AutoEvaluate(dataset=dataset, - data_mode=data_mode, - field_map=field_map, - template_dir=template_dir, - evaluation_mode=evaluation_mode, - model_name=model_name, - evaluation_metrics=evaluation_metrics, - debug_mode=True) +evaluator = AutoEvaluate( + dataset=dataset, + data_mode=data_mode, + field_map=field_map, + template_dir=template_dir, + evaluation_mode=evaluation_mode, + model_name=model_name, + evaluation_metrics=evaluation_metrics, + debug_mode=True, +) responses = evaluator.measure() for response in responses: print(response) ``` -That's it! 
For troubleshooting, please submit an issue and we will get right on it. \ No newline at end of file +That's it! For troubleshooting, please submit an issue and we will get right on it. diff --git a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md index 55f1db85..6cf9ba3e 100644 --- a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md +++ b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md @@ -2,5 +2,5 @@ - Score 1: If the answer is empty string or something like "I do not know the answer", the correctness score is 1. - Score 2: If the answer only addresses a small part of the question correctly or it is missing many critical steps/aspects of the answer or the answer is too short to fully answer the question or is missing many steps causing the answer to not fully address the problem described in the question, then the correctness score is 2. - Score 3: The answer mostly addresses the question but one critical aspect/step is missing or is incorrect. - - Score 4: the answer mostly answer the question and covers all critical/main aspects of the question, but it's missing important/neccessary details about one or more aspects. - - Score 5: the answer correctly and completely addresses the query. It also covers important details about each step. \ No newline at end of file + - Score 4: the answer mostly answer the question and covers all critical/main aspects of the question, but it's missing important/necessary details about one or more aspects. + - Score 5: the answer correctly and completely addresses the query. It also covers important details about each step. diff --git a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md index 0289723a..dd3bdfaf 100644 --- a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md +++ b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md @@ -3,4 +3,4 @@ - Score 2: only a small part of the answer is contained in the context but most of it is imaginary/hallucinated or the meaning is completely changed from what is represented in the context. - Score 3: Only about half of the answer is contained in the context. Rest of the answer is hallucinated or imaginary. - Score 4: Most of the claims in the answer can be inferred from the provided context with very little information that is not directly supported by the provided context. - - Score 5: All of the claims in the answer are directly supported by the provided context, demonstrating high faithfulness to the provided context. \ No newline at end of file + - Score 5: All of the claims in the answer are directly supported by the provided context, demonstrating high faithfulness to the provided context. 
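Stepping back from the rubric fixes above: these metric templates get stitched into a single judge prompt by `create_grading_format()` in `prompt_engineering.py`, and `utils/helper.py` ships an `extract_score` helper for turning the judge's reply back into numbers. A minimal, self-contained sketch of that parsing step follows — the `response` string is an invented example of the requested output format (not real model output), and the regex mirrors the helper's approach rather than reusing the repo's exact pattern:

```python3
import re

# Invented example of a judge reply in the format requested by create_grading_format().
response = """{
Reasoning for factualness: The answer is directly supported by the provided context.
Score for factualness: 5

Reasoning for relevance: The answer stays on the topic of the question.
Score for relevance: 4
}"""

def extract_score(pattern: str, text: str) -> int:
    # Same fallback behaviour as utils/helper.py: default to a score of 1 when nothing matches.
    match = re.search(pattern, text.lower())
    return int(match.group(1)) if match else 1

scores = {
    metric: extract_score(r"score for {}: (\d+)".format(metric), response)
    for metric in ["factualness", "relevance"]
}
print(scores)  # -> {'factualness': 5, 'relevance': 4}
```

In the code shown in this series, `measure()` returns the raw judge responses; a helper like the one above is how those strings can be reduced to per-metric scores.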
diff --git a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md index 1ddd1345..a059ff0f 100644 --- a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md +++ b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md @@ -3,4 +3,4 @@ - Score 2: the answer is slightly readable, there are irrelevant symbols or HTML tags or repeated words, but it can roughly form a meaningful sentence that can cover some aspects of the answer. - Score 3: Answer can be read but there are grammatical mistakes in the answer. - Score 4: the answer readable, but the readability and style can improved to better appeal to the reader. - - Score 5: the answer is reader friendly and well written. \ No newline at end of file + - Score 5: the answer is reader friendly and well written. diff --git a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md index 83f8e248..9a009de3 100644 --- a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md +++ b/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md @@ -2,5 +2,5 @@ - Score 1: The answer doesn't mention anything about the question or is completely irrelevant to the question. - Score 2: The answer only identifies the domain (e.g. cnvrg) mentioned in the question and provides information from the correct domain. But, the answer does not address the question itself and the point of the question is completely missed by it. - Score 3: The answer correctly identifies the domain and essence of the question but the details in the answer are not relevant to the focus of the question. - - Score 4: The answer correctly identifies domain mentioned the question and essence of the qustion as well as stays consistent with both of them. But there is some part of the answer that is not relevant to the question or it's topic or it's essence. This irrelvant part is damaging the overall relevance of the answer. - - Score 5: The answer is completely relevant to the question and the details do not deviate from the essence of the question. There are no parts of the answer that are irrelevant or unneccessary for the given question. \ No newline at end of file + - Score 4: The answer correctly identifies domain mentioned the question and essence of the question as well as stays consistent with both of them. But there is some part of the answer that is not relevant to the question or it's topic or it's essence. This irrelevant part is damaging the overall relevance of the answer. + - Score 5: The answer is completely relevant to the question and the details do not deviate from the essence of the question. There are no parts of the answer that are irrelevant or unnecessary for the given question. diff --git a/evals/evaluation/auto_eval/auto_eval_metrics/opening_prompt.md b/evals/evaluation/auto_eval/auto_eval_metrics/opening_prompt.md index bc938aa8..72e54a3e 100644 --- a/evals/evaluation/auto_eval/auto_eval_metrics/opening_prompt.md +++ b/evals/evaluation/auto_eval/auto_eval_metrics/opening_prompt.md @@ -6,8 +6,8 @@ You will be given an input consisting of a question, an answer and a context. Yo Important rules for you while completing this task: 1. 
You MUST ALWAYS provide a score for every metric mentioned below. 2. Make sure to understand definition of every metric fully before completing your task. Every metric is provided with grading scale and rubric. You MUST use this grading scale and rubric to determine your score. -3. Ensure that your scores and reasoning for every metric is indepedent of each other e.g., score for factualness should not impact score for corectness and vice versa. +3. Ensure that your scores and reasoning for every metric is independent of each other e.g., score for factualness should not impact score for correctness and vice versa. 4. Base your grading decision only on the given inputs and do not speculate or hallucinate. 5. You must also provide reasoning for your score in a single sentence. -Your metric definitions along with grading scale and rubric: \ No newline at end of file +Your metric definitions along with grading scale and rubric: diff --git a/evals/evaluation/auto_eval/prompt_engineering.py b/evals/evaluation/auto_eval/prompt_engineering.py index 8e2987c2..31d77a37 100644 --- a/evals/evaluation/auto_eval/prompt_engineering.py +++ b/evals/evaluation/auto_eval/prompt_engineering.py @@ -1,11 +1,15 @@ -from jinja2 import Environment, FileSystemLoader, Template +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import os + from dotenv import load_dotenv +from jinja2 import Environment, FileSystemLoader, Template + class Prompt: + """Class to customize prompt template using user-defined list of metrics.""" - """class to customize prompt template using user-defined list of metrics""" - def __init__(self, metrics, input_fields, prompt_dir): self.metrics = metrics self.input_fields = input_fields @@ -17,9 +21,11 @@ def define_template_paths(self, prompt_dir): metric_prompt_names = ["{}_prompt.md".format(metric) for metric in self.metrics] local_metric_prompt_paths = [os.path.join("metric_prompt_templates", m) for m in metric_prompt_names] self.metric_prompt_paths = [os.path.join(prompt_dir, p) for p in local_metric_prompt_paths] - + def create_grading_format(self): - grading_format = "You must ALWAYS provide every single one of the scores and reasonings in the following JSON format:" + grading_format = ( + "You must ALWAYS provide every single one of the scores and reasonings in the following JSON format:" + ) grading_format += "\n" + "{" + "\n" content = [] reasoning_prompt = "Reasoning for {}: [your one line step by step reasoning about the {} of the answer]" @@ -27,16 +33,16 @@ def create_grading_format(self): for metric in self.metrics: reasoning = reasoning_prompt.format(metric, metric) score = scoring_prompt.format(metric, metric) - content += reasoning + "\n" + score, + content += (reasoning + "\n" + score,) grading_format += "\n\n".join(content) grading_format += "\n" + "}" return grading_format - + def create_closing_prompt(self): closing_prompt = ["Let's begin!"] for f in self.input_fields: - closing_prompt += "Provided {}:".format(f) + '\n' + "{{" + f + "}}", - return '\n\n'.join(closing_prompt) + closing_prompt += ("Provided {}:".format(f) + "\n" + "{{" + f + "}}",) + return "\n\n".join(closing_prompt) @staticmethod def load_template(template_path): @@ -45,13 +51,13 @@ def load_template(template_path): return env.get_template(template_path) def load_prompt_template(self): - content = [self.load_template(self.opening_prompt_path).render()] + content = [self.load_template(self.opening_prompt_path).render()] for path in self.metric_prompt_paths: - content += 
self.load_template(path).render(), - content += self.create_grading_format(), - content += self.create_closing_prompt(), - return Template('\n\n'.join(content)) - + content += (self.load_template(path).render(),) + content += (self.create_grading_format(),) + content += (self.create_closing_prompt(),) + return Template("\n\n".join(content)) + def render_prompt(self, **kwargs) -> str: text = self.template.render(**kwargs) return text @@ -59,12 +65,12 @@ def render_prompt(self, **kwargs) -> str: if __name__ == "__main__": - """Here, we test implementation of Prompt class""" + """Here, we test implementation of Prompt class.""" # step 0 - user input - metrics = ['factualness', 'relevance', 'correctness', 'readability'] - input_fields = ['question', 'answer', 'context'] - prompt_dir = './auto_eval_metrics/' + metrics = ["factualness", "relevance", "correctness", "readability"] + input_fields = ["question", "answer", "context"] + prompt_dir = "./auto_eval_metrics/" # step 1 - load jinja2 environment load_dotenv(os.path.join(os.path.dirname(__file__), ".env"), override=True) @@ -73,17 +79,14 @@ def render_prompt(self, **kwargs) -> str: prompt = Prompt(metrics=metrics, input_fields=input_fields, prompt_dir=prompt_dir) example = { - "question": "Who is wife of Barak Obama", - "context": "Michelle Obama, wife of Barak Obama (former President of the United States of America) is an attorney. Barak and Michelle Obama have 2 daughters - Malia and Sasha", - "answer": "Michelle Obama", - "ground_truth": "Wife of Barak Obama is Michelle Obama", + "question": "Who is wife of Barak Obama", + "context": "Michelle Obama, wife of Barak Obama (former President of the United States of America) is an attorney. Barak and Michelle Obama have 2 daughters - Malia and Sasha", + "answer": "Michelle Obama", + "ground_truth": "Wife of Barak Obama is Michelle Obama", } rendered_prompt = prompt.render_prompt( - question=example['question'], - answer=example['answer'], - context=example['context']) + question=example["question"], answer=example["answer"], context=example["context"] + ) print(rendered_prompt) - - diff --git a/evals/evaluation/auto_eval/rag_dataset.py b/evals/evaluation/auto_eval/rag_dataset.py index 6dc99e42..a955eae6 100644 --- a/evals/evaluation/auto_eval/rag_dataset.py +++ b/evals/evaluation/auto_eval/rag_dataset.py @@ -1,12 +1,14 @@ -from datasets import Dataset, load_dataset -import jsonlines +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import os -class RAGDataset: +import jsonlines +from datasets import Dataset, load_dataset - """ - Dataset class to store data in HF datasets API format - """ + +class RAGDataset: + """Dataset class to store data in HF datasets API format.""" def __init__(self, dataset, field_map, mode): self.dataset = dataset @@ -25,18 +27,18 @@ def load_data(self): ex = {} for out_field, in_field in self.field_map.items(): if type(obj[in_field]) == list: - ex[out_field] = '\n'.join(obj[in_field]) + ex[out_field] = "\n".join(obj[in_field]) else: ex[out_field] = obj[in_field] data.append(ex) return Dataset.from_list(data) else: data = [] - for obj in load_dataset(self.dataset)['train']: + for obj in load_dataset(self.dataset)["train"]: ex = {} for out_field, in_field in self.field_map.items(): if type(obj[in_field]) == list: - ex[out_field] = '\n'.join(obj[in_field]) + ex[out_field] = "\n".join(obj[in_field]) else: ex[out_field] = obj[in_field] data.append(ex) @@ -49,40 +51,36 @@ def validate_dataset(self): def __getitem__(self, index): return 
self.data[index] - + def __len__(self): return len(self.data) - + def __iter__(self): return iter(self.data) - + + if __name__ == "__main__": - dataset_path = '../../benchmark/ragas/ground_truth.jsonl' + dataset_path = "../../benchmark/ragas/ground_truth.jsonl" field_map = { - 'question' : 'question', - 'ground_truth' : 'ground_truth', - 'context' : 'context', + "question": "question", + "ground_truth": "ground_truth", + "context": "context", } - ds = RAGDataset(dataset=dataset_path, - field_map=field_map, - mode="local") - + ds = RAGDataset(dataset=dataset_path, field_map=field_map, mode="local") + for i, ex in enumerate(ds): - assert ex['question'] == ds[i]['question'], "index {} does not have correct query".format(i) + assert ex["question"] == ds[i]["question"], "index {} does not have correct query".format(i) dataset = "explodinggradients/ragas-wikiqa" field_map = { - 'question' : 'question', - 'answer' : 'generated_with_rag', - 'context' : 'context', - 'ground_truth' : 'correct_answer' + "question": "question", + "answer": "generated_with_rag", + "context": "context", + "ground_truth": "correct_answer", } ds = RAGDataset(dataset=dataset, field_map=field_map, mode="benchmarking") for i, ex in enumerate(ds): - assert ex['question'] == ds[i]['question'], "index {} does not have correct query".format(i) - - - + assert ex["question"] == ds[i]["question"], "index {} does not have correct query".format(i) diff --git a/evals/evaluation/auto_eval/run_eval.py b/evals/evaluation/auto_eval/run_eval.py index 4591b7d0..76d87a3f 100644 --- a/evals/evaluation/auto_eval/run_eval.py +++ b/evals/evaluation/auto_eval/run_eval.py @@ -1,122 +1,103 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import argparse import ast -from evals.evaluation.auto_eval.rag_dataset import RAGDataset -from dotenv import load_dotenv -from huggingface_hub import login -from jinja2 import Environment, FileSystemLoader import json import os +import time + import pandas as pd +from dotenv import load_dotenv +from huggingface_hub import login +from jinja2 import Environment, FileSystemLoader + from evals.evaluation.auto_eval.prompt_engineering import Prompt -import time -from evals.evaluation.auto_eval.utils.model import * +from evals.evaluation.auto_eval.rag_dataset import RAGDataset from evals.evaluation.auto_eval.utils.helper import * - +from evals.evaluation.auto_eval.utils.model import * def get_args(): parser = argparse.ArgumentParser() parser.add_argument( - "--input_data", type=str, - default="explodinggradients/ragas-wikiqa", - help="path of the input data" + "--input_data", type=str, default="explodinggradients/ragas-wikiqa", help="path of the input data" ) parser.add_argument( - "--data_mode", type=str, - default="benchmarking", - help="mode of data can be local or benchmarking" + "--data_mode", type=str, default="benchmarking", help="mode of data can be local or benchmarking" ) parser.add_argument( - "--field_map", type=dict, - default={'question' : 'question','answer' : 'generated_with_rag','context' : 'context'}, - help="field map that will be used while loading the dataset" + "--field_map", + type=dict, + default={"question": "question", "answer": "generated_with_rag", "context": "context"}, + help="field map that will be used while loading the dataset", ) + parser.add_argument("--template_dir", type=str, default="auto_eval_metrics", help="path to dir of prompt templates") + parser.add_argument("--hf_token", type=str, default="", help="Please provide your HF token") 
parser.add_argument( - "--template_dir", type=str, default="auto_eval_metrics", - help="path to dir of prompt templates" + "--openai_key", type=str, default="", help="please provide your OpenAI key" ) parser.add_argument( - "--hf_token", type=str, default="", - help="Please provide your HF token" + "--evaluation_mode", type=str, default="endpoint", help="evaluation mode can be openai / endpoint / local" ) parser.add_argument( - "--openai_key", type=str, - default="", - help="please provide your OpenAI key" + "--model_name", type=str, default="http://localhost:8085", help="the model to be used for evaluation" ) parser.add_argument( - "--evaluation_mode", type=str, - default="endpoint", - help="evaluation mode can be openai / endpoint / local" - ) - parser.add_argument( - "--model_name", type=str, - default="http://localhost:8085", - help="the model to be used for evaluation" - ) - parser.add_argument( - "--evaluation_metrics", type=list, - default=["factualness", "relevance", "correctness", "readability"], - help="metrics to be used for evaluation of RAG" - ) - parser.add_argument( - "--log_path", type=str, default="./exp1.log", - help="path of the log file" + "--evaluation_metrics", + type=list, + default=["factualness", "relevance", "correctness", "readability"], + help="metrics to be used for evaluation of RAG", ) + parser.add_argument("--log_path", type=str, default="./exp1.log", help="path of the log file") args = parser.parse_args() return args def load_template(template_path): - template = Environment( - loader=FileSystemLoader(os.path.dirname(os.path.abspath(__file__))) - ).get_template(template_path) - return template - - - + template = Environment(loader=FileSystemLoader(os.path.dirname(os.path.abspath(__file__)))).get_template( + template_path + ) + return template def log_responses(responses, args): - sep = '\n' + '-'*100 + '\n' + sep = "\n" + "-" * 100 + "\n" text = sep.join(responses) - with open(args.log_path, 'w') as f: + with open(args.log_path, "w") as f: f.write(text) - class AutoEvaluate: - - - def __init__(self, - dataset, - data_mode, - field_map, - template_dir, - evaluation_mode, - model_name, - evaluation_metrics, - hf_token=None, - openai_key=None, - debug_mode=None - ): + def __init__( + self, + dataset, + data_mode, + field_map, + template_dir, + evaluation_mode, + model_name, + evaluation_metrics, + hf_token=None, + openai_key=None, + debug_mode=None, + ): self.GENERATION_CONFIG = { - "openai" : {"temperature" : 0.1}, - "endpoint" : {"max_tokens": 500}, - "local" : {"max_new_tokens" : 500} - } + "openai": {"temperature": 0.1}, + "endpoint": {"max_tokens": 500}, + "local": {"max_new_tokens": 500}, + } self.load_env() self.data = RAGDataset(dataset=dataset, field_map=field_map, mode=data_mode) - self.evaluator = self.get_evaluator(evaluation_mode, - model_name, - openai_key, - hf_token) + self.evaluator = self.get_evaluator(evaluation_mode, model_name, openai_key, hf_token) self.prompt_template = self.get_template(evaluation_metrics, field_map, template_dir) self.debug_mode = debug_mode self.generation_config = self.GENERATION_CONFIG[evaluation_mode] - def load_env(self,): + def load_env( + self, + ): dot_env_path = os.path.join(os.path.dirname(__file__), ".env") print("Loading dot environment from {}".format(dot_env_path)) load_dotenv(dot_env_path, override=True) @@ -137,27 +118,27 @@ def get_evaluator(self, evaluation_mode, model_name, openai_key=None, hf_token=N return evaluator def get_template(self, evaluation_metrics, field_map, template_dir): - - return 
Prompt(metrics=evaluation_metrics, - input_fields=field_map, - prompt_dir=template_dir).template + + return Prompt(metrics=evaluation_metrics, input_fields=field_map, prompt_dir=template_dir).template def measure(self): n_samples = 1 if self.debug_mode else len(self.data) - responses = [''] * n_samples + responses = [""] * n_samples start = time.time() for i in range(n_samples): - prompt = render_prompt(self.prompt_template, - query=self.data[i]['question'], - answer=self.data[i]['answer'], - context=self.data[i]['context']) + prompt = render_prompt( + self.prompt_template, + query=self.data[i]["question"], + answer=self.data[i]["answer"], + context=self.data[i]["context"], + ) messages = [{"role": "user", "content": prompt}] response = self.evaluator.generate(messages, **self.generation_config) responses[i] = response end = time.time() - print("Generation of scores and reasoning took {:.2f} seconds for {:,} examples".format(end-start, n_samples)) - return responses - + print("Generation of scores and reasoning took {:.2f} seconds for {:,} examples".format(end - start, n_samples)) + return responses + # if __name__ == "__main__": @@ -174,9 +155,9 @@ def measure(self): # evaluation_mode = "endpoint" # model_name = "http://localhost:8085" -# evaluation_metrics = ["factualness", -# "relevance", -# "correctness", +# evaluation_metrics = ["factualness", +# "relevance", +# "correctness", # "readability"] # evaluator = AutoEvaluate(dataset=dataset, @@ -193,4 +174,3 @@ def measure(self): # for response in responses: # print(response) # print("-"*100) - \ No newline at end of file diff --git a/evals/evaluation/auto_eval/utils/__init__.py b/evals/evaluation/auto_eval/utils/__init__.py index 7b2e1b9a..c3d7e5cf 100644 --- a/evals/evaluation/auto_eval/utils/__init__.py +++ b/evals/evaluation/auto_eval/utils/__init__.py @@ -1,4 +1,7 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import sys import os -sys.path.append(os.path.dirname(os.path.abspath(__file__))) \ No newline at end of file +sys.path.append(os.path.dirname(os.path.abspath(__file__))) diff --git a/evals/evaluation/auto_eval/utils/helper.py b/evals/evaluation/auto_eval/utils/helper.py index 5095e299..71fdef65 100644 --- a/evals/evaluation/auto_eval/utils/helper.py +++ b/evals/evaluation/auto_eval/utils/helper.py @@ -1,35 +1,37 @@ -import json -import yaml -import os +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import os import re -import pandas as pd -import numpy as np -from scipy.stats import pearsonr +import numpy as np +import pandas as pd +import yaml from jinja2 import Template - +from scipy.stats import pearsonr from sklearn.metrics import mean_squared_error def load_jsonl(data_path): - result = [] - with open(data_path, 'r') as f: - for line in f: - data = json.loads(line) - result.append(data) - return result + result = [] + with open(data_path, "r") as f: + for line in f: + data = json.loads(line) + result.append(data) + return result def load_config(config_path): - + with open(config_path, "r") as file: config = yaml.safe_load(file) - - return config + + return config def compute_mse(x, y): - return mean_squared_error(x, y) + return mean_squared_error(x, y) def compute_pearson(x, y): @@ -47,7 +49,8 @@ def extract_delay_from_rate_limit_error_msg(text): return float(retry_time_from_message) else: return 5 - + + def render_prompt(template: Template, **kwargs) -> str: text = template.render(**kwargs) return text @@ -58,20 +61,24 @@ def 
extract_score(pattern: str, text: str): if match: score = int(match.group(1)) - else: + else: score = 1 - + return score + def compute_metric_wise_assessment(metrics, groundtruth, prediction): fine_grained_evaluation = pd.DataFrame(index=metrics) for i, metric in enumerate(metrics): - fine_grained_evaluation.loc[metric, 'MSE'] = compute_mse(groundtruth[i], prediction[i]) + fine_grained_evaluation.loc[metric, "MSE"] = compute_mse(groundtruth[i], prediction[i]) abs_diff = [abs(g - p) for g, p in zip(groundtruth[i], prediction[i])] for diff in [0, 1, 2]: - fine_grained_evaluation.loc[metric, "|label - score| <= {}".format(diff)] = sum(val <= diff for val in abs_diff) + fine_grained_evaluation.loc[metric, "|label - score| <= {}".format(diff)] = sum( + val <= diff for val in abs_diff + ) return fine_grained_evaluation + def compute_weighted_assessment(weights, groundtruth, prediction): weights, groundtruth, prediction = np.array(weights), np.array(groundtruth), np.array(prediction) weighted_labels = np.sum(weights[:, np.newaxis] * groundtruth, axis=0) @@ -79,4 +86,3 @@ def compute_weighted_assessment(weights, groundtruth, prediction): mse = compute_mse(weighted_labels, weighted_scores) pearson_correlation = compute_pearson(weighted_labels, weighted_scores) return mse, pearson_correlation - diff --git a/evals/evaluation/auto_eval/utils/model.py b/evals/evaluation/auto_eval/utils/model.py index 5b113bb4..b0358cba 100644 --- a/evals/evaluation/auto_eval/utils/model.py +++ b/evals/evaluation/auto_eval/utils/model.py @@ -1,44 +1,55 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + from typing import List -import torch -import openai -from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer -from transformers import pipeline -from transformers import AutoModel +import openai +import torch +from huggingface_hub import InferenceClient +from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, TextStreamer, pipeline -from .retry import retry_and_handle_exceptions from .helper import extract_delay_from_rate_limit_error_msg - -from huggingface_hub import InferenceClient +from .retry import retry_and_handle_exceptions class EndpointEvaluator: def __init__(self, model_name): self.client = InferenceClient(base_url="{}/v1/chat/completions".format(model_name)) - + def generate(self, messages, **kwargs): output = self.client.chat.completions.create( - model="tgi", - messages=messages, - stream=True, - **kwargs, - ) + model="tgi", + messages=messages, + stream=True, + **kwargs, + ) response = [chunk.choices[0].delta.content for chunk in output] response = [content for content in response if content] - response = ' '.join(response) + response = " ".join(response) return response + class HFEvaluator: def __init__(self, model_name): self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) device_map = "auto" if torch.cuda.is_available() else "cpu" - if device_map == 'cpu': - self.pipe = pipeline("text-generation", model=model_name, tokenizer=self.tokenizer, - torch_dtype=torch.bfloat16, device_map="cpu") + if device_map == "cpu": + self.pipe = pipeline( + "text-generation", + model=model_name, + tokenizer=self.tokenizer, + torch_dtype=torch.bfloat16, + device_map="cpu", + ) else: - self.pipe = pipeline("text-generation", model=model_name, tokenizer=self.tokenizer, - torch_dtype=torch.float16, device_map="auto") - + self.pipe = pipeline( + "text-generation", + model=model_name, + tokenizer=self.tokenizer, + 
torch_dtype=torch.float16, + device_map="auto", + ) + def generate(self, messages, **kwargs) -> List[float]: prompt = self.pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) @@ -47,12 +58,11 @@ def generate(self, messages, **kwargs) -> List[float]: return result - class OAIEvaluator: def __init__(self, openai_key, model_name): openai.api_key = openai_key - self.model_name = model_name - + self.model_name = model_name + @retry_and_handle_exceptions( exception_to_check=( openai.RateLimitError, @@ -63,10 +73,12 @@ def __init__(self, openai_key, model_name): extract_delay_from_error_message=extract_delay_from_rate_limit_error_msg, ) def generate(self, messages: list, **kwargs) -> List[float]: - return openai.chat.completions.create( + return ( + openai.chat.completions.create( model=self.model_name, messages=messages, **kwargs, - ).choices[0].message.content - - + ) + .choices[0] + .message.content + ) diff --git a/evals/evaluation/auto_eval/utils/retry.py b/evals/evaluation/auto_eval/utils/retry.py index 573544e7..bde26409 100644 --- a/evals/evaluation/auto_eval/utils/retry.py +++ b/evals/evaluation/auto_eval/utils/retry.py @@ -1,7 +1,10 @@ -from typing import Tuple, Union, Optional +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import functools -import time import random +import time +from typing import Optional, Tuple, Union def retry_and_handle_exceptions( @@ -21,25 +24,13 @@ def wrapper(*args, **kwargs): return func(*args, **kwargs) except exception_to_check as e: if i == max_retries - 1: - raise Exception( - "Func execution failed after {0} retries: {1}".format( - max_retries, e - ) - ) + raise Exception("Func execution failed after {0} retries: {1}".format(max_retries, e)) delay *= exponential_base * (1 + jitter * random.random()) delay_from_error_message = None if extract_delay_from_error_message is not None: - delay_from_error_message = extract_delay_from_error_message( - str(e) - ) - final_delay = ( - delay_from_error_message if delay_from_error_message else delay - ) - print( - "Func execution failed. Retrying in {0} seconds: {1}".format( - final_delay, e - ) - ) + delay_from_error_message = extract_delay_from_error_message(str(e)) + final_delay = delay_from_error_message if delay_from_error_message else delay + print("Func execution failed. 
Retrying in {0} seconds: {1}".format(final_delay, e)) time.sleep(final_delay) return wrapper diff --git a/tests/test_auto_eval.py b/tests/test_auto_eval.py index 494edeb6..aee21f8d 100644 --- a/tests/test_auto_eval.py +++ b/tests/test_auto_eval.py @@ -12,6 +12,7 @@ host_ip = os.getenv("host_ip", "localhost") port = os.getenv("port", "8008") + class TestRagasMetric(unittest.TestCase): # @unittest.skip("need pass localhost id") @@ -19,35 +20,31 @@ def test_ragas(self): dataset = "explodinggradients/ragas-wikiqa" data_mode = "benchmarking" - field_map = { - 'question' : 'question', - 'answer' : 'generated_with_rag', - 'context' : 'context' - } + field_map = {"question": "question", "answer": "generated_with_rag", "context": "context"} template_dir = "auto_eval_metrics" evaluation_mode = "endpoint" model_name = f"http://{host_ip}:{port}" - evaluation_metrics = ["factualness", - "relevance", - "correctness", - "readability"] + evaluation_metrics = ["factualness", "relevance", "correctness", "readability"] - evaluator = AutoEvaluate(dataset=dataset, - data_mode=data_mode, - field_map=field_map, - template_dir=template_dir, - evaluation_mode=evaluation_mode, - model_name=model_name, - evaluation_metrics=evaluation_metrics, - debug_mode=True) + evaluator = AutoEvaluate( + dataset=dataset, + data_mode=data_mode, + field_map=field_map, + template_dir=template_dir, + evaluation_mode=evaluation_mode, + model_name=model_name, + evaluation_metrics=evaluation_metrics, + debug_mode=True, + ) responses = evaluator.measure() for response in responses: print(response) + if __name__ == "__main__": unittest.main() From 9e2714e331b65e1ee9408704ee4f50b8cfd46b1b Mon Sep 17 00:00:00 2001 From: aasavari Date: Wed, 25 Sep 2024 02:07:25 +0000 Subject: [PATCH 16/32] added init file and changed import paths accordingly Signed-off-by: aasavari --- evals/evaluation/auto_eval/__init__.py | 10 +++++ evals/evaluation/auto_eval/run_eval.py | 59 ++------------------------ tests/test_auto_eval.py | 2 +- 3 files changed, 15 insertions(+), 56 deletions(-) create mode 100644 evals/evaluation/auto_eval/__init__.py diff --git a/evals/evaluation/auto_eval/__init__.py b/evals/evaluation/auto_eval/__init__.py new file mode 100644 index 00000000..317f6c3b --- /dev/null +++ b/evals/evaluation/auto_eval/__init__.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# + +from .run_eval import AutoEvaluate + +__all__ = [AutoEvaluate] \ No newline at end of file diff --git a/evals/evaluation/auto_eval/run_eval.py b/evals/evaluation/auto_eval/run_eval.py index 76d87a3f..9c158297 100644 --- a/evals/evaluation/auto_eval/run_eval.py +++ b/evals/evaluation/auto_eval/run_eval.py @@ -12,61 +12,10 @@ from huggingface_hub import login from jinja2 import Environment, FileSystemLoader -from evals.evaluation.auto_eval.prompt_engineering import Prompt -from evals.evaluation.auto_eval.rag_dataset import RAGDataset -from evals.evaluation.auto_eval.utils.helper import * -from evals.evaluation.auto_eval.utils.model import * - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_data", type=str, default="explodinggradients/ragas-wikiqa", help="path of the input data" - ) - parser.add_argument( - "--data_mode", type=str, default="benchmarking", help="mode of data can be local or benchmarking" - ) - parser.add_argument( - "--field_map", - type=dict, - default={"question": "question", "answer": "generated_with_rag", "context": 
"context"}, - help="field map that will be used while loading the dataset", - ) - parser.add_argument("--template_dir", type=str, default="auto_eval_metrics", help="path to dir of prompt templates") - parser.add_argument("--hf_token", type=str, default="", help="Please provide your HF token") - parser.add_argument( - "--openai_key", type=str, default="", help="please provide your OpenAI key" - ) - parser.add_argument( - "--evaluation_mode", type=str, default="endpoint", help="evaluation mode can be openai / endpoint / local" - ) - parser.add_argument( - "--model_name", type=str, default="http://localhost:8085", help="the model to be used for evaluation" - ) - parser.add_argument( - "--evaluation_metrics", - type=list, - default=["factualness", "relevance", "correctness", "readability"], - help="metrics to be used for evaluation of RAG", - ) - parser.add_argument("--log_path", type=str, default="./exp1.log", help="path of the log file") - args = parser.parse_args() - return args - - -def load_template(template_path): - template = Environment(loader=FileSystemLoader(os.path.dirname(os.path.abspath(__file__)))).get_template( - template_path - ) - return template - - -def log_responses(responses, args): - sep = "\n" + "-" * 100 + "\n" - text = sep.join(responses) - with open(args.log_path, "w") as f: - f.write(text) - +from .prompt_engineering import Prompt +from .rag_dataset import RAGDataset +from .utils.helper import * +from .utils.model import * class AutoEvaluate: diff --git a/tests/test_auto_eval.py b/tests/test_auto_eval.py index aee21f8d..a7c30e7e 100644 --- a/tests/test_auto_eval.py +++ b/tests/test_auto_eval.py @@ -7,7 +7,7 @@ import os import unittest -from evals.evaluation.auto_eval.run_eval import AutoEvaluate +from evals.evaluation.auto_eval import AutoEvaluate host_ip = os.getenv("host_ip", "localhost") port = os.getenv("port", "8008") From 2079a4d03a9c9aaed92c6a8cc9bd20c72e39e89e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 25 Sep 2024 02:04:37 +0000 Subject: [PATCH 17/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/evaluation/auto_eval/__init__.py | 2 +- evals/evaluation/auto_eval/run_eval.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/evals/evaluation/auto_eval/__init__.py b/evals/evaluation/auto_eval/__init__.py index 317f6c3b..e4892b7d 100644 --- a/evals/evaluation/auto_eval/__init__.py +++ b/evals/evaluation/auto_eval/__init__.py @@ -7,4 +7,4 @@ from .run_eval import AutoEvaluate -__all__ = [AutoEvaluate] \ No newline at end of file +__all__ = [AutoEvaluate] diff --git a/evals/evaluation/auto_eval/run_eval.py b/evals/evaluation/auto_eval/run_eval.py index 9c158297..538154f5 100644 --- a/evals/evaluation/auto_eval/run_eval.py +++ b/evals/evaluation/auto_eval/run_eval.py @@ -17,6 +17,7 @@ from .utils.helper import * from .utils.model import * + class AutoEvaluate: def __init__( From b4fa896a77f7f31e26a56600284649aa9b1fed16 Mon Sep 17 00:00:00 2001 From: aasavari Date: Wed, 25 Sep 2024 03:08:22 +0000 Subject: [PATCH 18/32] automatically setting template_dir param Signed-off-by: aasavari --- evals/evaluation/auto_eval/run_eval.py | 44 ++------------------------ tests/test_auto_eval.py | 3 -- 2 files changed, 3 insertions(+), 44 deletions(-) diff --git a/evals/evaluation/auto_eval/run_eval.py b/evals/evaluation/auto_eval/run_eval.py index 538154f5..2511c787 100644 --- a/evals/evaluation/auto_eval/run_eval.py +++ 
b/evals/evaluation/auto_eval/run_eval.py @@ -25,7 +25,6 @@ def __init__( dataset, data_mode, field_map, - template_dir, evaluation_mode, model_name, evaluation_metrics, @@ -41,7 +40,7 @@ def __init__( self.load_env() self.data = RAGDataset(dataset=dataset, field_map=field_map, mode=data_mode) self.evaluator = self.get_evaluator(evaluation_mode, model_name, openai_key, hf_token) - self.prompt_template = self.get_template(evaluation_metrics, field_map, template_dir) + self.prompt_template = self.get_template(evaluation_metrics, field_map) self.debug_mode = debug_mode self.generation_config = self.GENERATION_CONFIG[evaluation_mode] @@ -67,9 +66,8 @@ def get_evaluator(self, evaluation_mode, model_name, openai_key=None, hf_token=N evaluator = HFEvaluator(args.model_name) return evaluator - def get_template(self, evaluation_metrics, field_map, template_dir): - - return Prompt(metrics=evaluation_metrics, input_fields=field_map, prompt_dir=template_dir).template + def get_template(self, evaluation_metrics, field_map): + return Prompt(metrics=evaluation_metrics, input_fields=field_map, prompt_dir="./auto_eval_metrics").template def measure(self): n_samples = 1 if self.debug_mode else len(self.data) @@ -88,39 +86,3 @@ def measure(self): end = time.time() print("Generation of scores and reasoning took {:.2f} seconds for {:,} examples".format(end - start, n_samples)) return responses - - -# if __name__ == "__main__": - -# dataset = "explodinggradients/ragas-wikiqa" -# data_mode = "benchmarking" -# field_map = { -# 'question' : 'question', -# 'answer' : 'generated_with_rag', -# 'context' : 'context' -# } - -# template_dir = "auto_eval_metrics" - -# evaluation_mode = "endpoint" -# model_name = "http://localhost:8085" - -# evaluation_metrics = ["factualness", -# "relevance", -# "correctness", -# "readability"] - -# evaluator = AutoEvaluate(dataset=dataset, -# data_mode=data_mode, -# field_map=field_map, -# template_dir=template_dir, -# evaluation_mode=evaluation_mode, -# model_name=model_name, -# evaluation_metrics=evaluation_metrics, -# debug_mode=True) - -# responses = evaluator.measure() - -# for response in responses: -# print(response) -# print("-"*100) diff --git a/tests/test_auto_eval.py b/tests/test_auto_eval.py index a7c30e7e..90ab55eb 100644 --- a/tests/test_auto_eval.py +++ b/tests/test_auto_eval.py @@ -22,8 +22,6 @@ def test_ragas(self): data_mode = "benchmarking" field_map = {"question": "question", "answer": "generated_with_rag", "context": "context"} - template_dir = "auto_eval_metrics" - evaluation_mode = "endpoint" model_name = f"http://{host_ip}:{port}" @@ -33,7 +31,6 @@ def test_ragas(self): dataset=dataset, data_mode=data_mode, field_map=field_map, - template_dir=template_dir, evaluation_mode=evaluation_mode, model_name=model_name, evaluation_metrics=evaluation_metrics, From f266c3c512390f97f08d96feb9a37c330136bceb Mon Sep 17 00:00:00 2001 From: aasavari Date: Wed, 25 Sep 2024 23:33:53 +0000 Subject: [PATCH 19/32] moved auto_eval to metrics and generalized opening prompt Signed-off-by: aasavari --- evals/{evaluation => metrics}/auto_eval/.env | 0 evals/{evaluation => metrics}/auto_eval/README.md | 0 evals/{evaluation => metrics}/auto_eval/__init__.py | 0 .../metric_prompt_templates/correctness_prompt.md | 0 .../metric_prompt_templates/factualness_prompt.md | 0 .../metric_prompt_templates/readability_prompt.md | 0 .../metric_prompt_templates/relevance_prompt.md | 0 .../auto_eval/auto_eval_metrics/opening_prompt.md | 2 +- evals/{evaluation => metrics}/auto_eval/prompt_engineering.py | 0 
evals/{evaluation => metrics}/auto_eval/rag_dataset.py | 0 evals/{evaluation => metrics}/auto_eval/run_eval.py | 0 evals/{evaluation => metrics}/auto_eval/utils/__init__.py | 0 evals/{evaluation => metrics}/auto_eval/utils/helper.py | 0 evals/{evaluation => metrics}/auto_eval/utils/model.py | 0 evals/{evaluation => metrics}/auto_eval/utils/retry.py | 0 15 files changed, 1 insertion(+), 1 deletion(-) rename evals/{evaluation => metrics}/auto_eval/.env (100%) rename evals/{evaluation => metrics}/auto_eval/README.md (100%) rename evals/{evaluation => metrics}/auto_eval/__init__.py (100%) rename evals/{evaluation => metrics}/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md (100%) rename evals/{evaluation => metrics}/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md (100%) rename evals/{evaluation => metrics}/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md (100%) rename evals/{evaluation => metrics}/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md (100%) rename evals/{evaluation => metrics}/auto_eval/auto_eval_metrics/opening_prompt.md (88%) rename evals/{evaluation => metrics}/auto_eval/prompt_engineering.py (100%) rename evals/{evaluation => metrics}/auto_eval/rag_dataset.py (100%) rename evals/{evaluation => metrics}/auto_eval/run_eval.py (100%) rename evals/{evaluation => metrics}/auto_eval/utils/__init__.py (100%) rename evals/{evaluation => metrics}/auto_eval/utils/helper.py (100%) rename evals/{evaluation => metrics}/auto_eval/utils/model.py (100%) rename evals/{evaluation => metrics}/auto_eval/utils/retry.py (100%) diff --git a/evals/evaluation/auto_eval/.env b/evals/metrics/auto_eval/.env similarity index 100% rename from evals/evaluation/auto_eval/.env rename to evals/metrics/auto_eval/.env diff --git a/evals/evaluation/auto_eval/README.md b/evals/metrics/auto_eval/README.md similarity index 100% rename from evals/evaluation/auto_eval/README.md rename to evals/metrics/auto_eval/README.md diff --git a/evals/evaluation/auto_eval/__init__.py b/evals/metrics/auto_eval/__init__.py similarity index 100% rename from evals/evaluation/auto_eval/__init__.py rename to evals/metrics/auto_eval/__init__.py diff --git a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md b/evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md similarity index 100% rename from evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md rename to evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md diff --git a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md b/evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md similarity index 100% rename from evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md rename to evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md diff --git a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md b/evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md similarity index 100% rename from evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md rename to evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md diff --git 
a/evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md b/evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md similarity index 100% rename from evals/evaluation/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md rename to evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md diff --git a/evals/evaluation/auto_eval/auto_eval_metrics/opening_prompt.md b/evals/metrics/auto_eval/auto_eval_metrics/opening_prompt.md similarity index 88% rename from evals/evaluation/auto_eval/auto_eval_metrics/opening_prompt.md rename to evals/metrics/auto_eval/auto_eval_metrics/opening_prompt.md index 72e54a3e..0c03a43d 100644 --- a/evals/evaluation/auto_eval/auto_eval_metrics/opening_prompt.md +++ b/evals/metrics/auto_eval/auto_eval_metrics/opening_prompt.md @@ -1,4 +1,4 @@ -Consider yourself as an engineer working at cnvrg.io which is a Full Stack Machine Learning Operating System owned by Intel. +Consider yourself as an helpful, truthful and impartial judge. Your task: You will be given an input consisting of a question, an answer and a context. Your task is to act as an impartial judge and provide a numerical score between 1 to 5 for each of the following metrics for the given answer. diff --git a/evals/evaluation/auto_eval/prompt_engineering.py b/evals/metrics/auto_eval/prompt_engineering.py similarity index 100% rename from evals/evaluation/auto_eval/prompt_engineering.py rename to evals/metrics/auto_eval/prompt_engineering.py diff --git a/evals/evaluation/auto_eval/rag_dataset.py b/evals/metrics/auto_eval/rag_dataset.py similarity index 100% rename from evals/evaluation/auto_eval/rag_dataset.py rename to evals/metrics/auto_eval/rag_dataset.py diff --git a/evals/evaluation/auto_eval/run_eval.py b/evals/metrics/auto_eval/run_eval.py similarity index 100% rename from evals/evaluation/auto_eval/run_eval.py rename to evals/metrics/auto_eval/run_eval.py diff --git a/evals/evaluation/auto_eval/utils/__init__.py b/evals/metrics/auto_eval/utils/__init__.py similarity index 100% rename from evals/evaluation/auto_eval/utils/__init__.py rename to evals/metrics/auto_eval/utils/__init__.py diff --git a/evals/evaluation/auto_eval/utils/helper.py b/evals/metrics/auto_eval/utils/helper.py similarity index 100% rename from evals/evaluation/auto_eval/utils/helper.py rename to evals/metrics/auto_eval/utils/helper.py diff --git a/evals/evaluation/auto_eval/utils/model.py b/evals/metrics/auto_eval/utils/model.py similarity index 100% rename from evals/evaluation/auto_eval/utils/model.py rename to evals/metrics/auto_eval/utils/model.py diff --git a/evals/evaluation/auto_eval/utils/retry.py b/evals/metrics/auto_eval/utils/retry.py similarity index 100% rename from evals/evaluation/auto_eval/utils/retry.py rename to evals/metrics/auto_eval/utils/retry.py From f134058083ee7182cb53093ca234e48386019f42 Mon Sep 17 00:00:00 2001 From: aasavari Date: Thu, 26 Sep 2024 22:20:18 +0000 Subject: [PATCH 20/32] testing of auto-eval with Llama 3.2 successful Signed-off-by: aasavari --- evals/metrics/auto_eval/README.md | 2 +- tests/test_auto_eval.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/metrics/auto_eval/README.md b/evals/metrics/auto_eval/README.md index c260a17e..81c8cd15 100644 --- a/evals/metrics/auto_eval/README.md +++ b/evals/metrics/auto_eval/README.md @@ -13,7 +13,7 @@ AutoEval can run in 3 evaluation modes - - To launch HF endpoint on Gaudi2, please follow the 2-step 
instructions here - [tgi-gaudi](https://github.com/huggingface/tgi-gaudi). - Pass your endpoint url as `model_name` argument. 2. `evaluation_mode="openai"` uses openai backend. -- Please set your `OPEN_API_KEY` and your choice of model as `model_name` argument. +- Please set your `openai_key` and your choice of model as `model_name` argument. 3. `evaluation_mode="local"` uses your local hardware. - Set `hf_token` argument and set your favourite open-source model in `model_name` argument. - GPU usage will be prioritized after checking it's availability. If GPU is unavailable, the model will run on CPU. diff --git a/tests/test_auto_eval.py b/tests/test_auto_eval.py index 90ab55eb..f85914c1 100644 --- a/tests/test_auto_eval.py +++ b/tests/test_auto_eval.py @@ -7,7 +7,7 @@ import os import unittest -from evals.evaluation.auto_eval import AutoEvaluate +from evals.metrics.auto_eval import AutoEvaluate host_ip = os.getenv("host_ip", "localhost") port = os.getenv("port", "8008") From 3ca8ecb5c34e99d96198b5ea78a8935f6c9a91cc Mon Sep 17 00:00:00 2001 From: aasavari Date: Thu, 10 Oct 2024 19:48:58 -0700 Subject: [PATCH 21/32] removed .env loading and modularized metric templates --- evals/metrics/auto_eval/prompt_engineering.py | 40 ++++-------- .../auto_eval/prompt_templates/__init__.py | 26 ++++++++ .../correctness.py} | 10 ++- .../factualness.py} | 10 ++- .../opening_prompt.py} | 11 +++- .../readability.py} | 10 ++- .../relevance.py} | 10 ++- evals/metrics/auto_eval/run_eval.py | 63 ++++++++++++------- evals/metrics/auto_eval/utils/model.py | 2 +- tests/test_auto_eval.py | 11 +++- 10 files changed, 132 insertions(+), 61 deletions(-) create mode 100644 evals/metrics/auto_eval/prompt_templates/__init__.py rename evals/metrics/auto_eval/{auto_eval_metrics/metric_prompt_templates/correctness_prompt.md => prompt_templates/correctness.py} (67%) rename evals/metrics/auto_eval/{auto_eval_metrics/metric_prompt_templates/factualness_prompt.md => prompt_templates/factualness.py} (61%) rename evals/metrics/auto_eval/{auto_eval_metrics/opening_prompt.md => prompt_templates/opening_prompt.py} (75%) rename evals/metrics/auto_eval/{auto_eval_metrics/metric_prompt_templates/readability_prompt.md => prompt_templates/readability.py} (57%) rename evals/metrics/auto_eval/{auto_eval_metrics/metric_prompt_templates/relevance_prompt.md => prompt_templates/relevance.py} (78%) diff --git a/evals/metrics/auto_eval/prompt_engineering.py b/evals/metrics/auto_eval/prompt_engineering.py index 31d77a37..17bebda3 100644 --- a/evals/metrics/auto_eval/prompt_engineering.py +++ b/evals/metrics/auto_eval/prompt_engineering.py @@ -1,27 +1,19 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import os - -from dotenv import load_dotenv -from jinja2 import Environment, FileSystemLoader, Template +from jinja2 import Template +from .prompt_templates import NAME2METRIC +from .prompt_templates import * class Prompt: """Class to customize prompt template using user-defined list of metrics.""" - def __init__(self, metrics, input_fields, prompt_dir): - self.metrics = metrics + def __init__(self, metrics, input_fields): + self.metrics = metrics self.input_fields = input_fields - self.define_template_paths(prompt_dir) self.template = self.load_prompt_template() - def define_template_paths(self, prompt_dir): - self.opening_prompt_path = os.path.join(prompt_dir, "opening_prompt.md") - metric_prompt_names = ["{}_prompt.md".format(metric) for metric in self.metrics] - local_metric_prompt_paths = 
[os.path.join("metric_prompt_templates", m) for m in metric_prompt_names] - self.metric_prompt_paths = [os.path.join(prompt_dir, p) for p in local_metric_prompt_paths] - def create_grading_format(self): grading_format = ( "You must ALWAYS provide every single one of the scores and reasonings in the following JSON format:" @@ -44,16 +36,11 @@ def create_closing_prompt(self): closing_prompt += ("Provided {}:".format(f) + "\n" + "{{" + f + "}}",) return "\n\n".join(closing_prompt) - @staticmethod - def load_template(template_path): - dir = os.path.dirname(os.path.abspath(__file__)) - env = Environment(loader=FileSystemLoader(dir)) - return env.get_template(template_path) - def load_prompt_template(self): - content = [self.load_template(self.opening_prompt_path).render()] - for path in self.metric_prompt_paths: - content += (self.load_template(path).render(),) + content = [] + for metric_name in ["opening_prompt"] + self.metrics: + metric_instance = NAME2METRIC[metric_name] + content += metric_instance.template, content += (self.create_grading_format(),) content += (self.create_closing_prompt(),) return Template("\n\n".join(content)) @@ -70,13 +57,9 @@ def render_prompt(self, **kwargs) -> str: # step 0 - user input metrics = ["factualness", "relevance", "correctness", "readability"] input_fields = ["question", "answer", "context"] - prompt_dir = "./auto_eval_metrics/" - - # step 1 - load jinja2 environment - load_dotenv(os.path.join(os.path.dirname(__file__), ".env"), override=True) - # step 2 - load prompt using Prompt class - prompt = Prompt(metrics=metrics, input_fields=input_fields, prompt_dir=prompt_dir) + # step 1 - load prompt using Prompt class + prompt = Prompt(metrics=metrics, input_fields=input_fields) example = { "question": "Who is wife of Barak Obama", @@ -85,6 +68,7 @@ def render_prompt(self, **kwargs) -> str: "ground_truth": "Wife of Barak Obama is Michelle Obama", } + # step 2 - render prompt with given inputs rendered_prompt = prompt.render_prompt( question=example["question"], answer=example["answer"], context=example["context"] ) diff --git a/evals/metrics/auto_eval/prompt_templates/__init__.py b/evals/metrics/auto_eval/prompt_templates/__init__.py new file mode 100644 index 00000000..0c49b0a6 --- /dev/null +++ b/evals/metrics/auto_eval/prompt_templates/__init__.py @@ -0,0 +1,26 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from .opening_prompt import OpeningPrompt + +from .correctness import Correctness +from .factualness import Factualness +from .relevance import Relevance +from .readability import Readability + +__all__ = [ + "opening_prompt", + "correctness", + "factualness", + "relevance", + "readability" +] + +NAME2METRIC = {} + +def snake2camel(s): + return ''.join(x.capitalize() or '_' for x in s.split('_')) + +for name in __all__: + NAME2METRIC[name] = eval(snake2camel(name)) + \ No newline at end of file diff --git a/evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md b/evals/metrics/auto_eval/prompt_templates/correctness.py similarity index 67% rename from evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md rename to evals/metrics/auto_eval/prompt_templates/correctness.py index 6cf9ba3e..52afc089 100644 --- a/evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/correctness_prompt.md +++ b/evals/metrics/auto_eval/prompt_templates/correctness.py @@ -1,6 +1,12 @@ -- Correctness: correctness measures how accurately and comprehensively does the 
answer resolve problem posed in the question. +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +class Correctness: + name = "correctness" + required_columns = ['answer', 'context', 'question'] + template = """- Correctness: correctness measures how accurately and comprehensively does the answer resolve problem posed in the question. - Score 1: If the answer is empty string or something like "I do not know the answer", the correctness score is 1. - Score 2: If the answer only addresses a small part of the question correctly or it is missing many critical steps/aspects of the answer or the answer is too short to fully answer the question or is missing many steps causing the answer to not fully address the problem described in the question, then the correctness score is 2. - Score 3: The answer mostly addresses the question but one critical aspect/step is missing or is incorrect. - Score 4: the answer mostly answer the question and covers all critical/main aspects of the question, but itโ€™s missing important/necessary details about one or more aspects. - - Score 5: the answer correctly and completely addresses the query. It also covers important details about each step. + - Score 5: the answer correctly and completely addresses the query. It also covers important details about each step.""" \ No newline at end of file diff --git a/evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md b/evals/metrics/auto_eval/prompt_templates/factualness.py similarity index 61% rename from evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md rename to evals/metrics/auto_eval/prompt_templates/factualness.py index dd3bdfaf..15386f6c 100644 --- a/evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/factualness_prompt.md +++ b/evals/metrics/auto_eval/prompt_templates/factualness.py @@ -1,6 +1,12 @@ -- Factualness: Factualness assesses how much of the provided answer is contained within the provided context. A higher score indicates that a higher proportion of claims present in the answer are present or can be derived from the provided context. +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +class Factualness: + name = "factualness" + required_columns = ['answer', 'context'] + template = """- Factualness: Factualness assesses how much of the provided answer is contained within the provided context. A higher score indicates that a higher proportion of claims present in the answer are present or can be derived from the provided context. - Score 1: the answer is completely hallucinated i.e. not contained in the context at all or there is no answer. - Score 2: only a small part of the answer is contained in the context but most of it is imaginary/hallucinated or the meaning is completely changed from what is represented in the context. - Score 3: Only about half of the answer is contained in the context. Rest of the answer is hallucinated or imaginary. - Score 4: Most of the claims in the answer can be inferred from the provided context with very little information that is not directly supported by the provided context. - - Score 5: All of the claims in the answer are directly supported by the provided context, demonstrating high faithfulness to the provided context. 
+ - Score 5: All of the claims in the answer are directly supported by the provided context, demonstrating high faithfulness to the provided context.""" \ No newline at end of file diff --git a/evals/metrics/auto_eval/auto_eval_metrics/opening_prompt.md b/evals/metrics/auto_eval/prompt_templates/opening_prompt.py similarity index 75% rename from evals/metrics/auto_eval/auto_eval_metrics/opening_prompt.md rename to evals/metrics/auto_eval/prompt_templates/opening_prompt.py index 0c03a43d..c81c243d 100644 --- a/evals/metrics/auto_eval/auto_eval_metrics/opening_prompt.md +++ b/evals/metrics/auto_eval/prompt_templates/opening_prompt.py @@ -1,4 +1,11 @@ -Consider yourself as an helpful, truthful and impartial judge. +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +class OpeningPrompt: + name = "opening_prompt" + required_columns = [] + + template = """Consider yourself as an helpful, truthful and impartial judge. Your task: You will be given an input consisting of a question, an answer and a context. Your task is to act as an impartial judge and provide a numerical score between 1 to 5 for each of the following metrics for the given answer. @@ -10,4 +17,4 @@ 4. Base your grading decision only on the given inputs and do not speculate or hallucinate. 5. You must also provide reasoning for your score in a single sentence. -Your metric definitions along with grading scale and rubric: +Your metric definitions along with grading scale and rubric:""" diff --git a/evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md b/evals/metrics/auto_eval/prompt_templates/readability.py similarity index 57% rename from evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md rename to evals/metrics/auto_eval/prompt_templates/readability.py index a059ff0f..684464b9 100644 --- a/evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/readability_prompt.md +++ b/evals/metrics/auto_eval/prompt_templates/readability.py @@ -1,6 +1,12 @@ -- Readability: Readability measures clarity and lucidity of the answer. Readability is measured solely based on the answer and it does not consider the question or the context. +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +class Readability: + name = "readability" + required_columns = ['answer'] + template = """- Readability: Readability measures clarity and lucidity of the answer. Readability is measured solely based on the answer and it does not consider the question or the context. - Score 1: the answer is empty or "I do not know the answer" or completely unreadable or No meaningful information can be extracted from the answer, then the score is 1. - Score 2: the answer is slightly readable, there are irrelevant symbols or HTML tags or repeated words, but it can roughly form a meaningful sentence that can cover some aspects of the answer. - Score 3: Answer can be read but there are grammatical mistakes in the answer. - Score 4: the answer readable, but the readability and style can improved to better appeal to the reader. - - Score 5: the answer is reader friendly and well written. 
+ - Score 5: the answer is reader friendly and well written.""" \ No newline at end of file diff --git a/evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md b/evals/metrics/auto_eval/prompt_templates/relevance.py similarity index 78% rename from evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md rename to evals/metrics/auto_eval/prompt_templates/relevance.py index 9a009de3..1d7c7a76 100644 --- a/evals/metrics/auto_eval/auto_eval_metrics/metric_prompt_templates/relevance_prompt.md +++ b/evals/metrics/auto_eval/prompt_templates/relevance.py @@ -1,6 +1,12 @@ -- Relevance: Relevance measures how well the answer relates to the question. +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +class Relevance: + name = "relevance" + required_columns = ['question', 'answer'] + template = """- Relevance: Relevance measures how well the answer relates to the question. - Score 1: The answer doesn't mention anything about the question or is completely irrelevant to the question. - Score 2: The answer only identifies the domain (e.g. cnvrg) mentioned in the question and provides information from the correct domain. But, the answer does not address the question itself and the point of the question is completely missed by it. - Score 3: The answer correctly identifies the domain and essence of the question but the details in the answer are not relevant to the focus of the question. - Score 4: The answer correctly identifies domain mentioned the question and essence of the question as well as stays consistent with both of them. But there is some part of the answer that is not relevant to the question or it's topic or it's essence. This irrelevant part is damaging the overall relevance of the answer. - - Score 5: The answer is completely relevant to the question and the details do not deviate from the essence of the question. There are no parts of the answer that are irrelevant or unnecessary for the given question. + - Score 5: The answer is completely relevant to the question and the details do not deviate from the essence of the question. 
There are no parts of the answer that are irrelevant or unnecessary for the given question.""" \ No newline at end of file diff --git a/evals/metrics/auto_eval/run_eval.py b/evals/metrics/auto_eval/run_eval.py index 2511c787..9d35c2aa 100644 --- a/evals/metrics/auto_eval/run_eval.py +++ b/evals/metrics/auto_eval/run_eval.py @@ -1,16 +1,8 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import argparse -import ast -import json -import os import time - -import pandas as pd -from dotenv import load_dotenv from huggingface_hub import login -from jinja2 import Environment, FileSystemLoader from .prompt_engineering import Prompt from .rag_dataset import RAGDataset @@ -37,37 +29,29 @@ def __init__( "endpoint": {"max_tokens": 500}, "local": {"max_new_tokens": 500}, } - self.load_env() self.data = RAGDataset(dataset=dataset, field_map=field_map, mode=data_mode) self.evaluator = self.get_evaluator(evaluation_mode, model_name, openai_key, hf_token) self.prompt_template = self.get_template(evaluation_metrics, field_map) self.debug_mode = debug_mode self.generation_config = self.GENERATION_CONFIG[evaluation_mode] - def load_env( - self, - ): - dot_env_path = os.path.join(os.path.dirname(__file__), ".env") - print("Loading dot environment from {}".format(dot_env_path)) - load_dotenv(dot_env_path, override=True) - def get_evaluator(self, evaluation_mode, model_name, openai_key=None, hf_token=None): if evaluation_mode == "openai": - # assert args.model_name in ALLOWED_OPENAI_MODELS, "please provide a openai model from the given list of allowed models" print("Using {} openai key".format(openai_key)) evaluator = OAIEvaluator(openai_key, model_name) elif evaluation_mode == "endpoint": print("Loading HF endpoint at {}".format(model_name)) evaluator = EndpointEvaluator(model_name) else: - assert args.evaluation_mode == "local", "evaluation mode must be openai / endpoint / local" + assert evaluation_mode == "local", "evaluation mode must be openai / endpoint / local" print("Loading {} model locally".format(model_name)) - login(token=hf_token) - evaluator = HFEvaluator(args.model_name) + login(token=hf_token, add_to_git_credential=True) + evaluator = HFEvaluator(model_name) return evaluator def get_template(self, evaluation_metrics, field_map): - return Prompt(metrics=evaluation_metrics, input_fields=field_map, prompt_dir="./auto_eval_metrics").template + prompt = Prompt(metrics=evaluation_metrics, input_fields=field_map) + return prompt.template def measure(self): n_samples = 1 if self.debug_mode else len(self.data) @@ -86,3 +70,40 @@ def measure(self): end = time.time() print("Generation of scores and reasoning took {:.2f} seconds for {:,} examples".format(end - start, n_samples)) return responses + +if __name__ == "__main__": + + dataset = "explodinggradients/ragas-wikiqa" + data_mode = "benchmarking" + field_map = {"question": "question", "answer": "generated_with_rag", "context": "context"} + + # evaluation_mode = "endpoint" + # model_name = f"http://{host_ip}:{port}" + + evaluation_mode = "openai" + openai_key = "" + model_name="gpt-4o" + + evaluation_metrics = ["factualness", "relevance", "correctness", "readability"] + + evaluator = AutoEvaluate( + dataset=dataset, + data_mode=data_mode, + field_map=field_map, + evaluation_mode=evaluation_mode, + model_name=model_name, + evaluation_metrics=evaluation_metrics, + openai_key=openai_key, + debug_mode=True, + ) + + responses = evaluator.measure() + + for response in responses: + print(response) + + + + + + \ No newline at end 
of file diff --git a/evals/metrics/auto_eval/utils/model.py b/evals/metrics/auto_eval/utils/model.py index b0358cba..1c46d959 100644 --- a/evals/metrics/auto_eval/utils/model.py +++ b/evals/metrics/auto_eval/utils/model.py @@ -6,7 +6,7 @@ import openai import torch from huggingface_hub import InferenceClient -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, TextStreamer, pipeline +from transformers import AutoTokenizer, pipeline from .helper import extract_delay_from_rate_limit_error_msg from .retry import retry_and_handle_exceptions diff --git a/tests/test_auto_eval.py b/tests/test_auto_eval.py index f85914c1..cc5554f7 100644 --- a/tests/test_auto_eval.py +++ b/tests/test_auto_eval.py @@ -3,7 +3,6 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 - import os import unittest @@ -22,9 +21,17 @@ def test_ragas(self): data_mode = "benchmarking" field_map = {"question": "question", "answer": "generated_with_rag", "context": "context"} + # evaluation_mode = "openai" + # model_name = "gpt-4o" + # openai_key = "" + evaluation_mode = "endpoint" model_name = f"http://{host_ip}:{port}" + # evaluation_mode = "local" + # model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1" + # hf_token = "" + evaluation_metrics = ["factualness", "relevance", "correctness", "readability"] evaluator = AutoEvaluate( @@ -34,6 +41,8 @@ def test_ragas(self): evaluation_mode=evaluation_mode, model_name=model_name, evaluation_metrics=evaluation_metrics, + # openai_key=openai_key, + # hf_token=hf_token, debug_mode=True, ) From 9157b83936e3064768683d2c4f7dd904a2689d1f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Oct 2024 02:55:05 +0000 Subject: [PATCH 22/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/metrics/auto_eval/prompt_engineering.py | 9 +++++---- .../metrics/auto_eval/prompt_templates/__init__.py | 13 ++++--------- .../auto_eval/prompt_templates/correctness.py | 11 ++++++----- .../auto_eval/prompt_templates/factualness.py | 13 +++++++------ .../auto_eval/prompt_templates/opening_prompt.py | 11 ++++++----- .../auto_eval/prompt_templates/readability.py | 11 ++++++----- .../metrics/auto_eval/prompt_templates/relevance.py | 11 ++++++----- evals/metrics/auto_eval/run_eval.py | 10 +++------- tests/requirements.txt | 2 +- 9 files changed, 44 insertions(+), 47 deletions(-) diff --git a/evals/metrics/auto_eval/prompt_engineering.py b/evals/metrics/auto_eval/prompt_engineering.py index 17bebda3..3ab6e7e1 100644 --- a/evals/metrics/auto_eval/prompt_engineering.py +++ b/evals/metrics/auto_eval/prompt_engineering.py @@ -2,15 +2,16 @@ # SPDX-License-Identifier: Apache-2.0 from jinja2 import Template -from .prompt_templates import NAME2METRIC + from .prompt_templates import * +from .prompt_templates import NAME2METRIC class Prompt: """Class to customize prompt template using user-defined list of metrics.""" def __init__(self, metrics, input_fields): - self.metrics = metrics + self.metrics = metrics self.input_fields = input_fields self.template = self.load_prompt_template() @@ -37,10 +38,10 @@ def create_closing_prompt(self): return "\n\n".join(closing_prompt) def load_prompt_template(self): - content = [] + content = [] for metric_name in ["opening_prompt"] + self.metrics: metric_instance = NAME2METRIC[metric_name] - content += metric_instance.template, + content += (metric_instance.template,) content += (self.create_grading_format(),) 
content += (self.create_closing_prompt(),) return Template("\n\n".join(content)) diff --git a/evals/metrics/auto_eval/prompt_templates/__init__.py b/evals/metrics/auto_eval/prompt_templates/__init__.py index 0c49b0a6..2b3979ba 100644 --- a/evals/metrics/auto_eval/prompt_templates/__init__.py +++ b/evals/metrics/auto_eval/prompt_templates/__init__.py @@ -8,19 +8,14 @@ from .relevance import Relevance from .readability import Readability -__all__ = [ - "opening_prompt", - "correctness", - "factualness", - "relevance", - "readability" -] +__all__ = ["opening_prompt", "correctness", "factualness", "relevance", "readability"] NAME2METRIC = {} + def snake2camel(s): - return ''.join(x.capitalize() or '_' for x in s.split('_')) + return "".join(x.capitalize() or "_" for x in s.split("_")) + for name in __all__: NAME2METRIC[name] = eval(snake2camel(name)) - \ No newline at end of file diff --git a/evals/metrics/auto_eval/prompt_templates/correctness.py b/evals/metrics/auto_eval/prompt_templates/correctness.py index 52afc089..a328d3d2 100644 --- a/evals/metrics/auto_eval/prompt_templates/correctness.py +++ b/evals/metrics/auto_eval/prompt_templates/correctness.py @@ -1,12 +1,13 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + class Correctness: name = "correctness" - required_columns = ['answer', 'context', 'question'] - template = """- Correctness: correctness measures how accurately and comprehensively does the answer resolve problem posed in the question. - - Score 1: If the answer is empty string or something like "I do not know the answer", the correctness score is 1. + required_columns = ["answer", "context", "question"] + template = """- Correctness: correctness measures how accurately and comprehensively does the answer resolve problem posed in the question. + - Score 1: If the answer is empty string or something like "I do not know the answer", the correctness score is 1. - Score 2: If the answer only addresses a small part of the question correctly or it is missing many critical steps/aspects of the answer or the answer is too short to fully answer the question or is missing many steps causing the answer to not fully address the problem described in the question, then the correctness score is 2. - - Score 3: The answer mostly addresses the question but one critical aspect/step is missing or is incorrect. + - Score 3: The answer mostly addresses the question but one critical aspect/step is missing or is incorrect. - Score 4: the answer mostly answer the question and covers all critical/main aspects of the question, but itโ€™s missing important/necessary details about one or more aspects. - - Score 5: the answer correctly and completely addresses the query. It also covers important details about each step.""" \ No newline at end of file + - Score 5: the answer correctly and completely addresses the query. It also covers important details about each step.""" diff --git a/evals/metrics/auto_eval/prompt_templates/factualness.py b/evals/metrics/auto_eval/prompt_templates/factualness.py index 15386f6c..7fa6dfee 100644 --- a/evals/metrics/auto_eval/prompt_templates/factualness.py +++ b/evals/metrics/auto_eval/prompt_templates/factualness.py @@ -1,12 +1,13 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + class Factualness: name = "factualness" - required_columns = ['answer', 'context'] - template = """- Factualness: Factualness assesses how much of the provided answer is contained within the provided context. 
A higher score indicates that a higher proportion of claims present in the answer are present or can be derived from the provided context. - - Score 1: the answer is completely hallucinated i.e. not contained in the context at all or there is no answer. + required_columns = ["answer", "context"] + template = """- Factualness: Factualness assesses how much of the provided answer is contained within the provided context. A higher score indicates that a higher proportion of claims present in the answer are present or can be derived from the provided context. + - Score 1: the answer is completely hallucinated i.e. not contained in the context at all or there is no answer. - Score 2: only a small part of the answer is contained in the context but most of it is imaginary/hallucinated or the meaning is completely changed from what is represented in the context. - - Score 3: Only about half of the answer is contained in the context. Rest of the answer is hallucinated or imaginary. - - Score 4: Most of the claims in the answer can be inferred from the provided context with very little information that is not directly supported by the provided context. - - Score 5: All of the claims in the answer are directly supported by the provided context, demonstrating high faithfulness to the provided context.""" \ No newline at end of file + - Score 3: Only about half of the answer is contained in the context. Rest of the answer is hallucinated or imaginary. + - Score 4: Most of the claims in the answer can be inferred from the provided context with very little information that is not directly supported by the provided context. + - Score 5: All of the claims in the answer are directly supported by the provided context, demonstrating high faithfulness to the provided context.""" diff --git a/evals/metrics/auto_eval/prompt_templates/opening_prompt.py b/evals/metrics/auto_eval/prompt_templates/opening_prompt.py index c81c243d..441f371e 100644 --- a/evals/metrics/auto_eval/prompt_templates/opening_prompt.py +++ b/evals/metrics/auto_eval/prompt_templates/opening_prompt.py @@ -1,6 +1,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + class OpeningPrompt: name = "opening_prompt" required_columns = [] @@ -8,13 +9,13 @@ class OpeningPrompt: template = """Consider yourself as an helpful, truthful and impartial judge. Your task: -You will be given an input consisting of a question, an answer and a context. Your task is to act as an impartial judge and provide a numerical score between 1 to 5 for each of the following metrics for the given answer. +You will be given an input consisting of a question, an answer and a context. Your task is to act as an impartial judge and provide a numerical score between 1 to 5 for each of the following metrics for the given answer. Important rules for you while completing this task: -1. You MUST ALWAYS provide a score for every metric mentioned below. +1. You MUST ALWAYS provide a score for every metric mentioned below. 2. Make sure to understand definition of every metric fully before completing your task. Every metric is provided with grading scale and rubric. You MUST use this grading scale and rubric to determine your score. -3. Ensure that your scores and reasoning for every metric is independent of each other e.g., score for factualness should not impact score for correctness and vice versa. -4. Base your grading decision only on the given inputs and do not speculate or hallucinate. -5. 
You must also provide reasoning for your score in a single sentence. +3. Ensure that your scores and reasoning for every metric is independent of each other e.g., score for factualness should not impact score for correctness and vice versa. +4. Base your grading decision only on the given inputs and do not speculate or hallucinate. +5. You must also provide reasoning for your score in a single sentence. Your metric definitions along with grading scale and rubric:""" diff --git a/evals/metrics/auto_eval/prompt_templates/readability.py b/evals/metrics/auto_eval/prompt_templates/readability.py index 684464b9..4c03e6e7 100644 --- a/evals/metrics/auto_eval/prompt_templates/readability.py +++ b/evals/metrics/auto_eval/prompt_templates/readability.py @@ -1,12 +1,13 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + class Readability: name = "readability" - required_columns = ['answer'] - template = """- Readability: Readability measures clarity and lucidity of the answer. Readability is measured solely based on the answer and it does not consider the question or the context. - - Score 1: the answer is empty or "I do not know the answer" or completely unreadable or No meaningful information can be extracted from the answer, then the score is 1. + required_columns = ["answer"] + template = """- Readability: Readability measures clarity and lucidity of the answer. Readability is measured solely based on the answer and it does not consider the question or the context. + - Score 1: the answer is empty or "I do not know the answer" or completely unreadable or No meaningful information can be extracted from the answer, then the score is 1. - Score 2: the answer is slightly readable, there are irrelevant symbols or HTML tags or repeated words, but it can roughly form a meaningful sentence that can cover some aspects of the answer. - Score 3: Answer can be read but there are grammatical mistakes in the answer. - - Score 4: the answer readable, but the readability and style can improved to better appeal to the reader. - - Score 5: the answer is reader friendly and well written.""" \ No newline at end of file + - Score 4: the answer readable, but the readability and style can improved to better appeal to the reader. + - Score 5: the answer is reader friendly and well written.""" diff --git a/evals/metrics/auto_eval/prompt_templates/relevance.py b/evals/metrics/auto_eval/prompt_templates/relevance.py index 1d7c7a76..33743ecc 100644 --- a/evals/metrics/auto_eval/prompt_templates/relevance.py +++ b/evals/metrics/auto_eval/prompt_templates/relevance.py @@ -1,12 +1,13 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + class Relevance: name = "relevance" - required_columns = ['question', 'answer'] - template = """- Relevance: Relevance measures how well the answer relates to the question. + required_columns = ["question", "answer"] + template = """- Relevance: Relevance measures how well the answer relates to the question. - Score 1: The answer doesn't mention anything about the question or is completely irrelevant to the question. - - Score 2: The answer only identifies the domain (e.g. cnvrg) mentioned in the question and provides information from the correct domain. But, the answer does not address the question itself and the point of the question is completely missed by it. + - Score 2: The answer only identifies the domain (e.g. cnvrg) mentioned in the question and provides information from the correct domain. 
But, the answer does not address the question itself and the point of the question is completely missed by it. - Score 3: The answer correctly identifies the domain and essence of the question but the details in the answer are not relevant to the focus of the question. - - Score 4: The answer correctly identifies domain mentioned the question and essence of the question as well as stays consistent with both of them. But there is some part of the answer that is not relevant to the question or it's topic or it's essence. This irrelevant part is damaging the overall relevance of the answer. - - Score 5: The answer is completely relevant to the question and the details do not deviate from the essence of the question. There are no parts of the answer that are irrelevant or unnecessary for the given question.""" \ No newline at end of file + - Score 4: The answer correctly identifies domain mentioned the question and essence of the question as well as stays consistent with both of them. But there is some part of the answer that is not relevant to the question or it's topic or it's essence. This irrelevant part is damaging the overall relevance of the answer. + - Score 5: The answer is completely relevant to the question and the details do not deviate from the essence of the question. There are no parts of the answer that are irrelevant or unnecessary for the given question.""" diff --git a/evals/metrics/auto_eval/run_eval.py b/evals/metrics/auto_eval/run_eval.py index 9d35c2aa..affc9f3c 100644 --- a/evals/metrics/auto_eval/run_eval.py +++ b/evals/metrics/auto_eval/run_eval.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import time + from huggingface_hub import login from .prompt_engineering import Prompt @@ -71,6 +72,7 @@ def measure(self): print("Generation of scores and reasoning took {:.2f} seconds for {:,} examples".format(end - start, n_samples)) return responses + if __name__ == "__main__": dataset = "explodinggradients/ragas-wikiqa" @@ -82,7 +84,7 @@ def measure(self): evaluation_mode = "openai" openai_key = "" - model_name="gpt-4o" + model_name = "gpt-4o" evaluation_metrics = ["factualness", "relevance", "correctness", "readability"] @@ -101,9 +103,3 @@ def measure(self): for response in responses: print(response) - - - - - - \ No newline at end of file diff --git a/tests/requirements.txt b/tests/requirements.txt index 1bbc3cad..3f809f3e 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -6,4 +6,4 @@ langchain_community langchain_huggingface lm-eval==0.4.3 openai -ragas==0.1.19 \ No newline at end of file +ragas==0.1.19 From e9c915a7a738b2b46196dfd6e3efe31436c25b3d Mon Sep 17 00:00:00 2001 From: aasavari Date: Thu, 10 Oct 2024 21:16:09 -0700 Subject: [PATCH 23/32] merging requirements Signed-off-by: aasavari --- evals/metrics/ragas/ragas.py | 107 +++++++++++++++++++++-------------- tests/requirements.txt | 2 +- tests/test_ragas.py | 3 + 3 files changed, 68 insertions(+), 44 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 9b0a1d3e..c80ff94e 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -4,12 +4,16 @@ # SPDX-License-Identifier: Apache-2.0 # import os +import re from typing import Dict, Optional, Union from langchain_core.embeddings import Embeddings from langchain_core.language_models import BaseLanguageModel from langchain_huggingface import HuggingFaceEndpoint +# import * is only allowed at module level according to python syntax +from ragas.metrics import * + def 
format_ragas_metric_name(name: str): return f"{name} (ragas)" @@ -29,16 +33,17 @@ def __init__( self.model = model self.embeddings = embeddings self.metrics = metrics - self.validated_list = [ - "answer_correctness", - "answer_relevancy", - "answer_similarity", - "context_precision", - "context_recall", - "faithfulness", - "context_utilization", - # "reference_free_rubrics_score", - ] + + # self.validated_list = [ + # "answer_correctness", + # "answer_relevancy", + # "answer_similarity", + # "context_precision", + # "context_recall", + # "faithfulness", + # "context_utilization", + # # "reference_free_rubrics_score", + # ] async def a_measure(self, test_case: Dict): return self.measure(test_case) @@ -47,37 +52,51 @@ def measure(self, test_case: Dict): # sends to server try: from ragas import evaluate - from ragas.metrics import ( - answer_correctness, - answer_relevancy, - answer_similarity, - context_precision, - context_recall, - context_utilization, - faithfulness, - ) + from ragas.metrics import ALL_METRICS + + self.metric_names = [metric.__class__.__name__ for metric in ALL_METRICS] + self.metric_names = [re.sub(r"(? Date: Tue, 8 Oct 2024 10:11:36 +0800 Subject: [PATCH 24/32] Optimize path and link validity check. (#143) * Optimize path and link validity check. Signed-off-by: ZePan110 --- .github/workflows/pr-path-detection.yml | 86 ++++++++++++++----------- 1 file changed, 50 insertions(+), 36 deletions(-) diff --git a/.github/workflows/pr-path-detection.yml b/.github/workflows/pr-path-detection.yml index 2bfb3969..ec856355 100644 --- a/.github/workflows/pr-path-detection.yml +++ b/.github/workflows/pr-path-detection.yml @@ -17,28 +17,39 @@ jobs: - name: Checkout Repo GenAIEval uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Check the Validity of Hyperlinks run: | cd ${{github.workspace}} fail="FALSE" - url_lines=$(grep -Eo '\]\(http[s]?://[^)]+\)' --include='*.md' -r .|grep -Ev 'GenAIEval/blob/main') - if [ -n "$url_lines" ]; then - for url_line in $url_lines; do - url=$(echo "$url_line"|cut -d '(' -f2 | cut -d ')' -f1|sed 's/\.git$//') - path=$(echo "$url_line"|cut -d':' -f1 | cut -d'/' -f2-) - response=$(curl -L -s -o /dev/null -w "%{http_code}" "$url") - if [ "$response" -ne 200 ]; then - echo "**********Validation failed, try again**********" - response_retry=$(curl -s -o /dev/null -w "%{http_code}" "$url") - if [ "$response_retry" -eq 200 ]; then - echo "*****Retry successfully*****" - else - echo "Invalid link from ${{github.workspace}}/$path: $url" - fail="TRUE" - fi + merged_commit=$(git log -1 --format='%H') + changed_files="$(git diff --name-status --diff-filter=ARM ${{ github.event.pull_request.base.sha }} ${merged_commit} | awk '/\.md$/ {print $NF}')" + if [ -n "changed_files" ]; then + for changed_file in $changed_files; do + url_lines=$(grep -H -Eo '\]\(http[s]?://[^)]+\)' "$changed_file" | grep -Ev 'GenAIEval/blob/main') + if [ -n "$url_lines" ]; then + for url_line in $url_lines; do + url=$(echo "$url_line"|cut -d '(' -f2 | cut -d ')' -f1|sed 's/\.git$//') + path=$(echo "$url_line"|cut -d':' -f1 | cut -d'/' -f2-) + + response=$(curl -L -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response" -ne 200 ]; then + echo "**********Validation failed, try again**********" + response_retry=$(curl -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response_retry" -eq 200 ]; then + echo "*****Retry successfully*****" + else + echo "Invalid link from ${{github.workspace}}/$path: $url" + fail="TRUE" + fi + fi + done fi done + else + echo "No changed .md file." 
fi if [[ "$fail" == "TRUE" ]]; then @@ -56,6 +67,8 @@ jobs: - name: Checkout Repo GenAIEval uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Checking Relative Path Validity run: | @@ -69,33 +82,31 @@ jobs: branch="https://github.com/opea-project/GenAIEval/blob/${{ github.event.pull_request.head.ref }}" fi link_head="https://github.com/opea-project/GenAIEval/blob/main" + png_lines=$(grep -Eo '\]\([^)]+\)' --include='*.md' -r .|grep -Ev 'http') if [ -n "$png_lines" ]; then for png_line in $png_lines; do refer_path=$(echo "$png_line"|cut -d':' -f1 | cut -d'/' -f2-) png_path=$(echo "$png_line"|cut -d '(' -f2 | cut -d ')' -f1) if [[ "${png_path:0:1}" == "/" ]]; then - check_path=${{github.workspace}}$png_path - elif [[ "${png_path:0:1}" == "#" ]]; then - check_path=${{github.workspace}}/$refer_path$png_path + check_path=$png_path + elif [[ "$png_path" == *#* ]]; then + relative_path=$(echo "$png_path" | cut -d '#' -f1) + if [ -n "$relative_path" ]; then + check_path=$(dirname "$refer_path")/$relative_path + png_path=$(echo "$png_path" | awk -F'#' '{print "#" $2}') + else + check_path=$refer_path + fi else - check_path=${{github.workspace}}/$(dirname "$refer_path")/$png_path + check_path=$(dirname "$refer_path")/$png_path fi - real_path=$(realpath $check_path) - if [ $? -ne 0 ]; then - echo "Path $png_path in file ${{github.workspace}}/$refer_path does not exist" - fail="TRUE" - else - url=$link_head$(echo "$real_path" | sed 's|.*/GenAIEval||') - response=$(curl -I -L -s -o /dev/null -w "%{http_code}" "$url") - if [ "$response" -ne 200 ]; then - echo "**********Validation failed, try again**********" - response_retry=$(curl -s -o /dev/null -w "%{http_code}" "$url") - if [ "$response_retry" -eq 200 ]; then - echo "*****Retry successfully*****" - else - echo "Retry failed. Check branch ${{ github.event.pull_request.head.ref }}" - url_dev=$branch$(echo "$real_path" | sed 's|.*/GenAIEval||') + + if [ -e "$check_path" ]; then + real_path=$(realpath $check_path) + if [[ "$png_line" == *#* ]]; then + if [ -n "changed_files" ] && echo "$changed_files" | grep -q "^${refer_path}$"; then + url_dev=$branch$(echo "$real_path" | sed 's|.*/GenAIEval||')$png_path response=$(curl -I -L -s -o /dev/null -w "%{http_code}" "$url_dev") if [ "$response" -ne 200 ]; then echo "**********Validation failed, try again**********" @@ -103,14 +114,17 @@ jobs: if [ "$response_retry" -eq 200 ]; then echo "*****Retry successfully*****" else - echo "Invalid path from ${{github.workspace}}/$refer_path: $png_path" + echo "Invalid path from ${{github.workspace}}/$refer_path: $png_path, link: $url_dev" fail="TRUE" fi else - echo "Check branch ${{ github.event.pull_request.head.ref }} successfully." + echo "Validation succeed $png_line" fi fi fi + else + echo "$check_path does not exist" + fail="TRUE" fi done fi From d4c3391eafaf66c0fd3c1b1fd24fd96af1eff526 Mon Sep 17 00:00:00 2001 From: rowenaal Date: Mon, 7 Oct 2024 19:11:59 -0700 Subject: [PATCH 25/32] Signed-off-by: Rowena Almeida (#150) Signed-off-by: rowenaal --- evals/benchmark/grafana/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evals/benchmark/grafana/README.md b/evals/benchmark/grafana/README.md index 8e7fe83a..2a3cdf8c 100644 --- a/evals/benchmark/grafana/README.md +++ b/evals/benchmark/grafana/README.md @@ -62,7 +62,8 @@ password: admin If you have any Grafana installation issue please check this [link](https://grafana.com/docs/grafana/latest/setup-grafana/installation/). 
-The next step is to configure the data source for Grafana to scrape metrics from. Click on the "Data Source" button, select Prometheus, and specify the Prometheus url `localhost:9090`. +The next step is to configure the data source for Grafana to scrape metrics from. Click on the "Data Source" button, select Prometheus, and specify the Prometheus url `localhost:9090`. If the dashboard does not display data, under the `Other section` for the Data Source, change the HTTP method to `GET`. + ## 3. Import Grafana Dashboard After setup the Grafana server, then you can import a Grafana Dashboard through uploading a dashboard JSON file in the Grafana UI under `Home > Dashboards > Import dashboard`. You can use a file like [tgi_grafana.json](https://github.com/huggingface/text-generation-inference/blob/main/assets/tgi_grafana.json). From e3210035e0b3ad4c33865d24a30b0c05171f33a1 Mon Sep 17 00:00:00 2001 From: Zhenzhong1 <109137058+Zhenzhong1@users.noreply.github.com> Date: Thu, 10 Oct 2024 10:10:32 +0800 Subject: [PATCH 26/32] [Benchmark] Get benchmark reports. (#155) * added output_folders * updated subprocess run for output folders * get report done * fixed the output_folder issues * add the func of run_benchmark * add return * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- evals/benchmark/benchmark.py | 37 +++++++++++++++++-- .../benchmark/stresscli/commands/load_test.py | 2 + evals/benchmark/stresscli/commands/report.py | 20 ++++++++++ 3 files changed, 55 insertions(+), 4 deletions(-) diff --git a/evals/benchmark/benchmark.py b/evals/benchmark/benchmark.py index 77cab096..3a612928 100644 --- a/evals/benchmark/benchmark.py +++ b/evals/benchmark/benchmark.py @@ -1,7 +1,9 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import argparse import os +import subprocess from datetime import datetime import yaml @@ -251,18 +253,21 @@ def run_service_test(example, service_type, service, test_suite_config): ) # Run the test using locust_runtests function + output_folders = [] for index, run_yaml_path in enumerate(run_yaml_paths, start=1): print(f"[OPEA BENCHMARK] ๐Ÿš€ The {index} time test is running, run yaml: {run_yaml_path}...") - locust_runtests(None, run_yaml_path) + output_folders.append(locust_runtests(None, run_yaml_path)) print(f"[OPEA BENCHMARK] ๐Ÿš€ Test completed for {service_name} at {url}") + return output_folders + def process_service(example, service_type, case_data, test_suite_config): service = case_data.get(service_type) if service and service.get("run_test"): print(f"[OPEA BENCHMARK] ๐Ÿš€ Example: {example} Service: {service.get('service_name')}, Running test...") - run_service_test(example, service_type, service, test_suite_config) + return run_service_test(example, service_type, service, test_suite_config) def check_test_suite_config(test_suite_config): @@ -284,7 +289,7 @@ def check_test_suite_config(test_suite_config): raise ValueError("Must specify either run_time or user_queries.") -if __name__ == "__main__": +def run_benchmark(report=False): # Load test suit configuration yaml_content = load_yaml("./benchmark.yaml") # Extract data @@ -324,9 +329,33 @@ def check_test_suite_config(test_suite_config): "visualqna": ["lvm", "lvmserve", "e2e"], } + all_output_folders = [] # Process each example's services for example in parsed_data["examples"]: case_data = parsed_data["all_case_data"].get(example, 
{}) service_types = example_service_map.get(example, []) for service_type in service_types: - process_service(example, service_type, case_data, test_suite_config) + output_folder = process_service(example, service_type, case_data, test_suite_config) + if output_folder is not None: + all_output_folders.append(output_folder) + + if report: + print(all_output_folders) + all_results = dict() + for each_bench_folders in all_output_folders: + for folder in each_bench_folders: + from stresscli.commands.report import get_report_results + + results = get_report_results(folder) + all_results[folder] = results + print(f"results = {results}\n") + + return all_results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Read and parse JSON/YAML files and output JSON file") + parser.add_argument("--report", help="Return the perf", action="store_true") + args = parser.parse_args() + + run_benchmark(report=args.report) diff --git a/evals/benchmark/stresscli/commands/load_test.py b/evals/benchmark/stresscli/commands/load_test.py index ff7f2945..629a5ca8 100644 --- a/evals/benchmark/stresscli/commands/load_test.py +++ b/evals/benchmark/stresscli/commands/load_test.py @@ -74,6 +74,8 @@ def locust_runtests(kubeconfig, profile): click.echo(f"Load test results saved to {base_folder}") + return base_folder + def collect_metrics(collector, services, output_dir, namespace=None): """Collect metrics from the specified services and output directory. diff --git a/evals/benchmark/stresscli/commands/report.py b/evals/benchmark/stresscli/commands/report.py index 509bfaa9..ae334aaa 100644 --- a/evals/benchmark/stresscli/commands/report.py +++ b/evals/benchmark/stresscli/commands/report.py @@ -74,6 +74,26 @@ def report(ctx, folder, format, output): csvwriter.writerow(row) +def get_report_results(folder): + """Print the test report.""" + print(f"Get report results from: {folder}") + output_data = {} + testcases = get_testcases(folder) + for testcase in testcases: + include = "|".join([TESTSPEC_SECTION_NAME, CSV_SECTION_NAME, METRICS_SECTION_NAME]) + extracted_data = export_testdata(testcase, folder, include) + if extracted_data: + output_data[testcase] = extracted_data + + result = {} + for testcase, data in output_data.items(): + testcase_result = {} + for key, value in data.items(): + testcase_result[key] = value + result[testcase] = testcase_result + return result + + def export_testspec(testcase, folder): testspec_path = os.path.join(folder, f"{testcase}_testspec.yaml") extracted_data = {} From f1d2099fdaa7b38b40c2bfac3804106a9b99a3e7 Mon Sep 17 00:00:00 2001 From: Yi Yao Date: Thu, 10 Oct 2024 10:10:45 +0800 Subject: [PATCH 27/32] Support sharegpt dataset in chatqna e2e test (#152) * Support sharegpt dataset in chatqna e2e test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Change the log level for selected questions --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- evals/benchmark/README.md | 7 +++++ evals/benchmark/benchmark.py | 4 +++ evals/benchmark/benchmark.yaml | 12 +++++++-- .../benchmark/stresscli/commands/load_test.py | 18 +++++++++++-- evals/benchmark/stresscli/locust/aistress.py | 17 ++++++++++++ .../stresscli/locust/chatqnabench.py | 26 ++++++++++++++++--- 6 files changed, 77 insertions(+), 7 deletions(-) diff --git a/evals/benchmark/README.md b/evals/benchmark/README.md index e9bc2b6a..726513e9 100644 --- a/evals/benchmark/README.md +++ b/evals/benchmark/README.md @@ 
-72,6 +72,7 @@ test_suite_config: arrival-rate: 1.0 # Request arrival rate warm_ups: 0 # Number of test requests for warm-ups run_time: 60m # Total runtime for the test suite + seed: # The seed for all RNGs user_queries: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] # Number of test requests query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by defeult. random_prompt: false # Use random prompts if true, fixed prompts if false @@ -112,4 +113,10 @@ test_cases: e2e: run_test: true service_name: "chatqna-backend-server-svc" + service_list: # Replace with your k8s service names if deploy with k8s + # or container names if deploy with Docker for metrics collection, + # activate if collect_service_metric is true + - "chatqna-backend-server-svc" + dataset: # Activate if random_prompt=true: leave blank = default dataset(WebQuestions) or sharegpt ``` +If you'd like to use sharegpt dataset, please download the dataset according to the [guide](https://github.com/lm-sys/FastChat/issues/90#issuecomment-1493250773). Merge all downloaded data files into one file named sharegpt.json and put the file at `evals/benchmark/stresscli/dataset`. diff --git a/evals/benchmark/benchmark.py b/evals/benchmark/benchmark.py index 3a612928..6479d4ba 100644 --- a/evals/benchmark/benchmark.py +++ b/evals/benchmark/benchmark.py @@ -54,6 +54,7 @@ def extract_test_case_data(content): "service_port": test_suite_config.get("service_port"), "load_shape": test_suite_config.get("load_shape"), "query_timeout": test_suite_config.get("query_timeout", 120), + "seed": test_suite_config.get("seed", None), "all_case_data": { example: content["test_cases"].get(example, {}) for example in test_suite_config.get("examples", []) }, @@ -93,6 +94,8 @@ def create_run_yaml_content(service, base_url, bench_target, test_phase, num_que "bench-target": bench_target, "service-metric-collect": test_params["collect_service_metric"], "service-list": service.get("service_list", []), + "dataset": service.get("dataset", "default"), + "seed": test_params.get("seed", None), "llm-model": test_params["llm_model"], "deployment-type": test_params["deployment_type"], "load-shape": test_params["load_shape"], @@ -307,6 +310,7 @@ def run_benchmark(report=False): "load_shape": parsed_data["load_shape"], "query_timeout": parsed_data["query_timeout"], "warm_ups": parsed_data["warm_ups"], + "seed": parsed_data["seed"], } check_test_suite_config(test_suite_config) diff --git a/evals/benchmark/benchmark.yaml b/evals/benchmark/benchmark.yaml index 70ba4a61..37e9e61f 100644 --- a/evals/benchmark/benchmark.yaml +++ b/evals/benchmark/benchmark.yaml @@ -8,6 +8,7 @@ test_suite_config: # Overall configuration settings for the test suite service_port: None # Leave as None for k8s, specify for Docker warm_ups: 0 # Number of test requests for warm-up run_time: 60m # The max total run time for the test suite + seed: # The seed for all RNGs user_queries: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] # Number of test requests at each concurrency level query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by defeult. 
random_prompt: false # Use random prompts if true, fixed prompts if false @@ -67,8 +68,15 @@ test_cases: service_list: # Replace with your k8s service names if deploy with k8s # or container names if deploy with Docker for metrics collection, # activate if collect_service_metric is true - - "chatqna-tei" - - "chatqna-teirerank" + - "chatqna-backend-server-svc" + - "chatqna-nginx-svc" + - "dataprep-svc" + - "embedding-dependency-svc" + - "llm-dependency-svc" + - "reranking-dependency-svc" + - "retriever-svc" + - "vector-db" + dataset: # Activate if random_prompt=true: leave blank = default dataset(WebQuestions) or sharegpt codegen: llm: diff --git a/evals/benchmark/stresscli/commands/load_test.py b/evals/benchmark/stresscli/commands/load_test.py index 629a5ca8..4c015454 100644 --- a/evals/benchmark/stresscli/commands/load_test.py +++ b/evals/benchmark/stresscli/commands/load_test.py @@ -32,6 +32,8 @@ "max-request": 100, "namespace": "default", "load-shape": {"name": DEFAULT_LOADSHAPE}, + "dataset": "default", + "seed": "none", } console_logger = logging.getLogger("opea.eval") @@ -130,7 +132,10 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in "deployment-type", global_settings.get("deployment-type", locust_defaults["deployment-type"]) ) runspec["namespace"] = run_settings.get("namespace", global_settings.get("namespace", locust_defaults["namespace"])) - + runspec["dataset"] = run_settings.get("dataset", global_settings.get("dataset", locust_defaults["dataset"])) + runspec["dataset"] = locust_defaults["dataset"] if runspec["dataset"] is None else runspec["dataset"] + runspec["seed"] = run_settings.get("seed", global_settings.get("seed", locust_defaults["seed"])) + runspec["seed"] = locust_defaults["seed"] if runspec["seed"] is None else runspec["seed"] runspec["run_name"] = run_settings["name"] # Specify load shape to adjust user distribution @@ -193,7 +198,12 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in processes = 10 if concurrent_level > 400 else 5 if concurrent_level > 200 else processes elif load_shape == "poisson": if load_shape_params and "arrival-rate" in load_shape_params: - processes = max(2, math.ceil(int(load_shape_params["arrival-rate"]) / 10)) + processes = max(2, math.ceil(int(load_shape_params["arrival-rate"]) / 5)) + else: + if load_shape_params and "arrival-rate" in load_shape_params: + processes = max(2, math.ceil(int(load_shape_params["arrival-rate"]) / 5)) + elif runspec["max_requests"] > 0: + processes = 10 if runspec["max_requests"] > 2000 else 5 if runspec["max_requests"] > 1000 else processes cmd = [ "locust", @@ -205,6 +215,10 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in runspec["runtime"], "--load-shape", runspec["load-shape"], + "--dataset", + runspec["dataset"], + "--seed", + str(runspec["seed"]), "--processes", str(processes), "--users", diff --git a/evals/benchmark/stresscli/locust/aistress.py b/evals/benchmark/stresscli/locust/aistress.py index d24638eb..52c030ba 100644 --- a/evals/benchmark/stresscli/locust/aistress.py +++ b/evals/benchmark/stresscli/locust/aistress.py @@ -50,6 +50,20 @@ def _(parser): default="constant", help="load shape to adjust conccurency at runtime", ) + parser.add_argument( + "--dataset", + type=str, + env_var="OPEA_EVAL_DATASET", + default="default", + help="dataset", + ) + parser.add_argument( + "--seed", + type=str, + env_var="OPEA_EVAL_SEED", + default="none", + help="The seed for all RNGs", + ) reqlist = [] @@ -188,11 
+202,14 @@ def on_test_start(environment, **kwargs): console_logger.info(f"Http timeout : {environment.parsed_options.http_timeout}\n") console_logger.info(f"Benchmark target : {environment.parsed_options.bench_target}\n") console_logger.info(f"Load shape : {environment.parsed_options.load_shape}") + console_logger.info(f"Dataset : {environment.parsed_options.dataset}") @events.init.add_listener def on_locust_init(environment, **_kwargs): global bench_package + os.environ["OPEA_EVAL_DATASET"] = environment.parsed_options.dataset + os.environ["OPEA_EVAL_SEED"] = environment.parsed_options.seed try: bench_package = __import__(environment.parsed_options.bench_target) except ImportError: diff --git a/evals/benchmark/stresscli/locust/chatqnabench.py b/evals/benchmark/stresscli/locust/chatqnabench.py index e5a4414b..1b4473c0 100644 --- a/evals/benchmark/stresscli/locust/chatqnabench.py +++ b/evals/benchmark/stresscli/locust/chatqnabench.py @@ -9,7 +9,15 @@ import tokenresponse as token cwd = os.path.dirname(__file__) -filename = f"{cwd}/../dataset/chatqna.json" +dataset = os.environ["OPEA_EVAL_DATASET"] +if dataset == "sharegpt": + filename = f"{cwd}/../dataset/sharegpt.json" +elif dataset == "default": + filename = f"{cwd}/../dataset/chatqna.json" +else: + logging.error(f"Dataset not found: dataset/{dataset}.json.") + exit() + qlist = [] try: with open(filename) as qfile: @@ -18,6 +26,10 @@ logging.error(f"Question File open failed: {filename}") exit() +seed = os.environ["OPEA_EVAL_SEED"] +if seed and seed != "none": + random.seed(seed) + def getUrl(): return "/v1/chatqna" @@ -26,9 +38,17 @@ def getUrl(): def getReqData(): qlen = len(qlist) qid = random.randint(0, qlen - 1) - logging.debug(f"Selected question: {qlist[qid]['qText']}") - return {"messages": qlist[qid]["qText"], "max_tokens": 128} + if dataset == "sharegpt": + msg = qlist[qid]["conversations"][0]["value"] + elif dataset == "default": + msg = qlist[qid]["qText"] + else: + msg = qlist[qid]["qText"] + + logging.debug(f"Selected question: {msg}") + + return {"messages": msg, "max_tokens": 128} def respStatics(environment, reqData, respData): From ddd360734e760cb7296994c708228621249450bc Mon Sep 17 00:00:00 2001 From: ZePan110 Date: Fri, 11 Oct 2024 09:40:42 +0800 Subject: [PATCH 28/32] Fix the issue of exiting due to inability to find hyperlinks. 
(#156) Signed-off-by: ZePan110 --- .github/workflows/pr-path-detection.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-path-detection.yml b/.github/workflows/pr-path-detection.yml index ec856355..b20f75ca 100644 --- a/.github/workflows/pr-path-detection.yml +++ b/.github/workflows/pr-path-detection.yml @@ -26,9 +26,9 @@ jobs: fail="FALSE" merged_commit=$(git log -1 --format='%H') changed_files="$(git diff --name-status --diff-filter=ARM ${{ github.event.pull_request.base.sha }} ${merged_commit} | awk '/\.md$/ {print $NF}')" - if [ -n "changed_files" ]; then + if [ -n "$changed_files" ]; then for changed_file in $changed_files; do - url_lines=$(grep -H -Eo '\]\(http[s]?://[^)]+\)' "$changed_file" | grep -Ev 'GenAIEval/blob/main') + url_lines=$(grep -H -Eo '\]\(http[s]?://[^)]+\)' "$changed_file" | grep -Ev 'GenAIEval/blob/main') || true if [ -n "$url_lines" ]; then for url_line in $url_lines; do url=$(echo "$url_line"|cut -d '(' -f2 | cut -d ')' -f1|sed 's/\.git$//') From e8a98502befc8111f263c36ab4291f52ca03406d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Oct 2024 02:55:05 +0000 Subject: [PATCH 29/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/metrics/auto_eval/prompt_engineering.py | 9 +++++---- .../metrics/auto_eval/prompt_templates/__init__.py | 13 ++++--------- .../auto_eval/prompt_templates/correctness.py | 11 ++++++----- .../auto_eval/prompt_templates/factualness.py | 13 +++++++------ .../auto_eval/prompt_templates/opening_prompt.py | 11 ++++++----- .../auto_eval/prompt_templates/readability.py | 11 ++++++----- .../metrics/auto_eval/prompt_templates/relevance.py | 11 ++++++----- evals/metrics/auto_eval/run_eval.py | 10 +++------- 8 files changed, 43 insertions(+), 46 deletions(-) diff --git a/evals/metrics/auto_eval/prompt_engineering.py b/evals/metrics/auto_eval/prompt_engineering.py index 17bebda3..3ab6e7e1 100644 --- a/evals/metrics/auto_eval/prompt_engineering.py +++ b/evals/metrics/auto_eval/prompt_engineering.py @@ -2,15 +2,16 @@ # SPDX-License-Identifier: Apache-2.0 from jinja2 import Template -from .prompt_templates import NAME2METRIC + from .prompt_templates import * +from .prompt_templates import NAME2METRIC class Prompt: """Class to customize prompt template using user-defined list of metrics.""" def __init__(self, metrics, input_fields): - self.metrics = metrics + self.metrics = metrics self.input_fields = input_fields self.template = self.load_prompt_template() @@ -37,10 +38,10 @@ def create_closing_prompt(self): return "\n\n".join(closing_prompt) def load_prompt_template(self): - content = [] + content = [] for metric_name in ["opening_prompt"] + self.metrics: metric_instance = NAME2METRIC[metric_name] - content += metric_instance.template, + content += (metric_instance.template,) content += (self.create_grading_format(),) content += (self.create_closing_prompt(),) return Template("\n\n".join(content)) diff --git a/evals/metrics/auto_eval/prompt_templates/__init__.py b/evals/metrics/auto_eval/prompt_templates/__init__.py index 0c49b0a6..2b3979ba 100644 --- a/evals/metrics/auto_eval/prompt_templates/__init__.py +++ b/evals/metrics/auto_eval/prompt_templates/__init__.py @@ -8,19 +8,14 @@ from .relevance import Relevance from .readability import Readability -__all__ = [ - "opening_prompt", - "correctness", - "factualness", - "relevance", - "readability" -] +__all__ = ["opening_prompt", 
"correctness", "factualness", "relevance", "readability"] NAME2METRIC = {} + def snake2camel(s): - return ''.join(x.capitalize() or '_' for x in s.split('_')) + return "".join(x.capitalize() or "_" for x in s.split("_")) + for name in __all__: NAME2METRIC[name] = eval(snake2camel(name)) - \ No newline at end of file diff --git a/evals/metrics/auto_eval/prompt_templates/correctness.py b/evals/metrics/auto_eval/prompt_templates/correctness.py index 52afc089..a328d3d2 100644 --- a/evals/metrics/auto_eval/prompt_templates/correctness.py +++ b/evals/metrics/auto_eval/prompt_templates/correctness.py @@ -1,12 +1,13 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + class Correctness: name = "correctness" - required_columns = ['answer', 'context', 'question'] - template = """- Correctness: correctness measures how accurately and comprehensively does the answer resolve problem posed in the question. - - Score 1: If the answer is empty string or something like "I do not know the answer", the correctness score is 1. + required_columns = ["answer", "context", "question"] + template = """- Correctness: correctness measures how accurately and comprehensively does the answer resolve problem posed in the question. + - Score 1: If the answer is empty string or something like "I do not know the answer", the correctness score is 1. - Score 2: If the answer only addresses a small part of the question correctly or it is missing many critical steps/aspects of the answer or the answer is too short to fully answer the question or is missing many steps causing the answer to not fully address the problem described in the question, then the correctness score is 2. - - Score 3: The answer mostly addresses the question but one critical aspect/step is missing or is incorrect. + - Score 3: The answer mostly addresses the question but one critical aspect/step is missing or is incorrect. - Score 4: the answer mostly answer the question and covers all critical/main aspects of the question, but itโ€™s missing important/necessary details about one or more aspects. - - Score 5: the answer correctly and completely addresses the query. It also covers important details about each step.""" \ No newline at end of file + - Score 5: the answer correctly and completely addresses the query. It also covers important details about each step.""" diff --git a/evals/metrics/auto_eval/prompt_templates/factualness.py b/evals/metrics/auto_eval/prompt_templates/factualness.py index 15386f6c..7fa6dfee 100644 --- a/evals/metrics/auto_eval/prompt_templates/factualness.py +++ b/evals/metrics/auto_eval/prompt_templates/factualness.py @@ -1,12 +1,13 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + class Factualness: name = "factualness" - required_columns = ['answer', 'context'] - template = """- Factualness: Factualness assesses how much of the provided answer is contained within the provided context. A higher score indicates that a higher proportion of claims present in the answer are present or can be derived from the provided context. - - Score 1: the answer is completely hallucinated i.e. not contained in the context at all or there is no answer. + required_columns = ["answer", "context"] + template = """- Factualness: Factualness assesses how much of the provided answer is contained within the provided context. A higher score indicates that a higher proportion of claims present in the answer are present or can be derived from the provided context. 
+ - Score 1: the answer is completely hallucinated i.e. not contained in the context at all or there is no answer. - Score 2: only a small part of the answer is contained in the context but most of it is imaginary/hallucinated or the meaning is completely changed from what is represented in the context. - - Score 3: Only about half of the answer is contained in the context. Rest of the answer is hallucinated or imaginary. - - Score 4: Most of the claims in the answer can be inferred from the provided context with very little information that is not directly supported by the provided context. - - Score 5: All of the claims in the answer are directly supported by the provided context, demonstrating high faithfulness to the provided context.""" \ No newline at end of file + - Score 3: Only about half of the answer is contained in the context. Rest of the answer is hallucinated or imaginary. + - Score 4: Most of the claims in the answer can be inferred from the provided context with very little information that is not directly supported by the provided context. + - Score 5: All of the claims in the answer are directly supported by the provided context, demonstrating high faithfulness to the provided context.""" diff --git a/evals/metrics/auto_eval/prompt_templates/opening_prompt.py b/evals/metrics/auto_eval/prompt_templates/opening_prompt.py index c81c243d..441f371e 100644 --- a/evals/metrics/auto_eval/prompt_templates/opening_prompt.py +++ b/evals/metrics/auto_eval/prompt_templates/opening_prompt.py @@ -1,6 +1,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + class OpeningPrompt: name = "opening_prompt" required_columns = [] @@ -8,13 +9,13 @@ class OpeningPrompt: template = """Consider yourself as an helpful, truthful and impartial judge. Your task: -You will be given an input consisting of a question, an answer and a context. Your task is to act as an impartial judge and provide a numerical score between 1 to 5 for each of the following metrics for the given answer. +You will be given an input consisting of a question, an answer and a context. Your task is to act as an impartial judge and provide a numerical score between 1 to 5 for each of the following metrics for the given answer. Important rules for you while completing this task: -1. You MUST ALWAYS provide a score for every metric mentioned below. +1. You MUST ALWAYS provide a score for every metric mentioned below. 2. Make sure to understand definition of every metric fully before completing your task. Every metric is provided with grading scale and rubric. You MUST use this grading scale and rubric to determine your score. -3. Ensure that your scores and reasoning for every metric is independent of each other e.g., score for factualness should not impact score for correctness and vice versa. -4. Base your grading decision only on the given inputs and do not speculate or hallucinate. -5. You must also provide reasoning for your score in a single sentence. +3. Ensure that your scores and reasoning for every metric is independent of each other e.g., score for factualness should not impact score for correctness and vice versa. +4. Base your grading decision only on the given inputs and do not speculate or hallucinate. +5. You must also provide reasoning for your score in a single sentence. 
Your metric definitions along with grading scale and rubric:""" diff --git a/evals/metrics/auto_eval/prompt_templates/readability.py b/evals/metrics/auto_eval/prompt_templates/readability.py index 684464b9..4c03e6e7 100644 --- a/evals/metrics/auto_eval/prompt_templates/readability.py +++ b/evals/metrics/auto_eval/prompt_templates/readability.py @@ -1,12 +1,13 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + class Readability: name = "readability" - required_columns = ['answer'] - template = """- Readability: Readability measures clarity and lucidity of the answer. Readability is measured solely based on the answer and it does not consider the question or the context. - - Score 1: the answer is empty or "I do not know the answer" or completely unreadable or No meaningful information can be extracted from the answer, then the score is 1. + required_columns = ["answer"] + template = """- Readability: Readability measures clarity and lucidity of the answer. Readability is measured solely based on the answer and it does not consider the question or the context. + - Score 1: the answer is empty or "I do not know the answer" or completely unreadable or No meaningful information can be extracted from the answer, then the score is 1. - Score 2: the answer is slightly readable, there are irrelevant symbols or HTML tags or repeated words, but it can roughly form a meaningful sentence that can cover some aspects of the answer. - Score 3: Answer can be read but there are grammatical mistakes in the answer. - - Score 4: the answer readable, but the readability and style can improved to better appeal to the reader. - - Score 5: the answer is reader friendly and well written.""" \ No newline at end of file + - Score 4: the answer readable, but the readability and style can improved to better appeal to the reader. + - Score 5: the answer is reader friendly and well written.""" diff --git a/evals/metrics/auto_eval/prompt_templates/relevance.py b/evals/metrics/auto_eval/prompt_templates/relevance.py index 1d7c7a76..33743ecc 100644 --- a/evals/metrics/auto_eval/prompt_templates/relevance.py +++ b/evals/metrics/auto_eval/prompt_templates/relevance.py @@ -1,12 +1,13 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + class Relevance: name = "relevance" - required_columns = ['question', 'answer'] - template = """- Relevance: Relevance measures how well the answer relates to the question. + required_columns = ["question", "answer"] + template = """- Relevance: Relevance measures how well the answer relates to the question. - Score 1: The answer doesn't mention anything about the question or is completely irrelevant to the question. - - Score 2: The answer only identifies the domain (e.g. cnvrg) mentioned in the question and provides information from the correct domain. But, the answer does not address the question itself and the point of the question is completely missed by it. + - Score 2: The answer only identifies the domain (e.g. cnvrg) mentioned in the question and provides information from the correct domain. But, the answer does not address the question itself and the point of the question is completely missed by it. - Score 3: The answer correctly identifies the domain and essence of the question but the details in the answer are not relevant to the focus of the question. - - Score 4: The answer correctly identifies domain mentioned the question and essence of the question as well as stays consistent with both of them. 
But there is some part of the answer that is not relevant to the question or it's topic or it's essence. This irrelevant part is damaging the overall relevance of the answer. - - Score 5: The answer is completely relevant to the question and the details do not deviate from the essence of the question. There are no parts of the answer that are irrelevant or unnecessary for the given question.""" \ No newline at end of file + - Score 4: The answer correctly identifies domain mentioned the question and essence of the question as well as stays consistent with both of them. But there is some part of the answer that is not relevant to the question or it's topic or it's essence. This irrelevant part is damaging the overall relevance of the answer. + - Score 5: The answer is completely relevant to the question and the details do not deviate from the essence of the question. There are no parts of the answer that are irrelevant or unnecessary for the given question.""" diff --git a/evals/metrics/auto_eval/run_eval.py b/evals/metrics/auto_eval/run_eval.py index 9d35c2aa..affc9f3c 100644 --- a/evals/metrics/auto_eval/run_eval.py +++ b/evals/metrics/auto_eval/run_eval.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import time + from huggingface_hub import login from .prompt_engineering import Prompt @@ -71,6 +72,7 @@ def measure(self): print("Generation of scores and reasoning took {:.2f} seconds for {:,} examples".format(end - start, n_samples)) return responses + if __name__ == "__main__": dataset = "explodinggradients/ragas-wikiqa" @@ -82,7 +84,7 @@ def measure(self): evaluation_mode = "openai" openai_key = "" - model_name="gpt-4o" + model_name = "gpt-4o" evaluation_metrics = ["factualness", "relevance", "correctness", "readability"] @@ -101,9 +103,3 @@ def measure(self): for response in responses: print(response) - - - - - - \ No newline at end of file From c332999cad85d4ce2b8df078c6e1690cd6084314 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Oct 2024 04:21:52 +0000 Subject: [PATCH 30/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/requirements.txt b/tests/requirements.txt index dbfd15b5..981396be 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,3 +1,6 @@ +<<<<<<< HEAD +======= +>>>>>>> 9157b83936e3064768683d2c4f7dd904a2689d1f bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@e5c2f31625223431d7987f43b70b75b9d26ba118 datasets jieba @@ -6,8 +9,5 @@ langchain_community langchain_huggingface lm-eval==0.4.3 openai -<<<<<<< HEAD python-dotenv -======= ->>>>>>> 9157b83936e3064768683d2c4f7dd904a2689d1f ragas==0.1.19 From 68a14a0d15a53a2b3bf9f3b17ed0334b1f896e20 Mon Sep 17 00:00:00 2001 From: aasavari Date: Thu, 10 Oct 2024 21:24:44 -0700 Subject: [PATCH 31/32] Removing python-dotenv from requirements Signed-off-by: aasavari --- tests/requirements.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/requirements.txt b/tests/requirements.txt index 981396be..1bbc3cad 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,6 +1,3 @@ -<<<<<<< HEAD -======= ->>>>>>> 9157b83936e3064768683d2c4f7dd904a2689d1f bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@e5c2f31625223431d7987f43b70b75b9d26ba118 datasets jieba @@ -9,5 +6,4 @@ 
langchain_community
 langchain_huggingface
 lm-eval==0.4.3
 openai
-python-dotenv
-ragas==0.1.19
+ragas==0.1.19
\ No newline at end of file

From 56ad626fc6f274dd2909c21a539907db40c3fc67 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 11 Oct 2024 04:29:36 +0000
Subject: [PATCH 32/32] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/requirements.txt b/tests/requirements.txt
index 1bbc3cad..3f809f3e 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -6,4 +6,4 @@ langchain_community
 langchain_huggingface
 lm-eval==0.4.3
 openai
-ragas==0.1.19
\ No newline at end of file
+ragas==0.1.19
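
For context on how the patched metric is exercised end to end, the sketch below shows one way the updated RagasMetric could be driven from a test such as tests/test_ragas.py. It is a minimal illustration under stated assumptions, not code from these patches: the import path, the endpoint URL, the embedding model name, the sample question/answer/contexts/ground_truth values, and the dict-of-lists test_case layout are all assumptions introduced for illustration; only the constructor arguments (threshold, model, embeddings, metrics) and the metric names follow the module being patched.

# Minimal usage sketch. Assumptions: import path, endpoint URL, embedding
# model, and sample data are placeholders, not taken from the patches above.
from langchain_huggingface import HuggingFaceEmbeddings

from evals.metrics.ragas.ragas import RagasMetric

# Embeddings are supplied because answer_relevancy is requested.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

metric = RagasMetric(
    threshold=0.5,
    model="http://localhost:8008",  # assumed HuggingFace endpoint URL
    embeddings=embeddings,
    metrics=["answer_relevancy", "faithfulness"],
)

# Assumed input layout: parallel lists, one entry per evaluated example.
test_case = {
    "question": ["What does the benchmark suite measure?"],
    "answer": ["It measures end-to-end latency and accuracy of the pipeline."],
    "contexts": [["The benchmark suite reports end-to-end latency and accuracy."]],
    "ground_truth": ["It reports end-to-end latency and accuracy."],
}

print(metric.measure(test_case))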