diff --git a/evals/metrics/auto_eval/.env b/evals/metrics/auto_eval/.env new file mode 100644 index 00000000..780dfc14 --- /dev/null +++ b/evals/metrics/auto_eval/.env @@ -0,0 +1 @@ +OPENAI_KEY=xxx \ No newline at end of file diff --git a/evals/metrics/auto_eval/README.md b/evals/metrics/auto_eval/README.md new file mode 100644 index 00000000..81c8cd15 --- /dev/null +++ b/evals/metrics/auto_eval/README.md @@ -0,0 +1,57 @@ +# Auto (annotation-free) Evaluation of Retrieval Augmented Generation + +We provide an easy-to-use, flexible and annotation-free RAG evaluation tool using LLM-as-a-judge while benefitting from Intel's Gaudi2 AI accelerator chips. + +## Overview +### Data +AutoEval is best suited for Long Form Question Answering (LFQA) datasets where you want to gauge quality and factualness of the answer via LLM's intelligence. Here, you can use benchmarking datasets or bring your own custom datasets. Please make sure to set `field_map` to map AutoEval fields such as "question" to your dataset's corresponding field like "query". +> Note : To use benchmarking datasets, set argument `data_mode=benchmarking`. Similarly, to use custom datasets, set `data_mode=local`. +### Model +AutoEval can run in 3 evaluation modes - +1. `evaluation_mode="endpoint"` uses HuggingFace endpoint. +- We recommend launching a HuggingFace endpoint on Gaudi AI accelerator machines to ensure maximum usage and performance. +- To launch HF endpoint on Gaudi2, please follow the 2-step instructions here - [tgi-gaudi](https://github.com/huggingface/tgi-gaudi). +- Pass your endpoint url as `model_name` argument. +2. `evaluation_mode="openai"` uses openai backend. +- Please set your `openai_key` and your choice of model as `model_name` argument. +3. `evaluation_mode="local"` uses your local hardware. +- Set `hf_token` argument and set your favourite open-source model in `model_name` argument. +- GPU usage will be prioritized after checking its availability. 
If GPU is unavailable, the model will run on CPU. +## Metrics +AutoEval provides 4 metrics - factualness, correctness, relevance and readability. You can also bring your own metrics and grading scales. Don't forget to add your metric to `evaluation_metrics` argument. +## Generation configuration +Please set generation parameters as per your requirement in `GENERATION_CONFIG` in `run_eval.py`. + +## Run using HF endpoint +```python3 +dataset = "explodinggradients/ragas-wikiqa" +data_mode = "benchmarking" +field_map = {"question": "question", "answer": "generated_with_rag", "context": "context"} + +template_dir = "auto_eval_metrics" + +evaluation_mode = "endpoint" + +host_ip = os.getenv("host_ip", "localhost") +port = os.getenv("port", "") +model_name = f"http://{host_ip}:{port}" + +evaluation_metrics = ["factualness", "relevance", "correctness", "readability"] + +evaluator = AutoEvaluate( + dataset=dataset, + data_mode=data_mode, + field_map=field_map, + template_dir=template_dir, + evaluation_mode=evaluation_mode, + model_name=model_name, + evaluation_metrics=evaluation_metrics, + debug_mode=True, +) + +responses = evaluator.measure() + +for response in responses: + print(response) +``` +That's it! For troubleshooting, please submit an issue and we will get right on it. 
from .run_eval import AutoEvaluate

# ``__all__`` must contain the *names* of the public objects as strings.
# Listing the class object itself makes ``from evals.metrics.auto_eval import *``
# fail with "TypeError: Item in __all__ must be str".
__all__ = ["AutoEvaluate"]
def snake2camel(s):
    """Convert a snake_case name to CamelCase.

    The input is split on underscores and every piece is passed through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased)
    before the pieces are concatenated. An empty piece, produced by a
    leading, trailing or doubled underscore or by an empty input, contributes
    a literal ``"_"`` instead of disappearing.
    """
    pieces = []
    for word in s.split("_"):
        capitalized = word.capitalize()
        # ``capitalize`` of an empty piece is "", which is falsy.
        pieces.append(capitalized if capitalized else "_")
    return "".join(pieces)
It also covers important details about each step.""" diff --git a/evals/metrics/auto_eval/prompt_templates/factualness.py b/evals/metrics/auto_eval/prompt_templates/factualness.py new file mode 100644 index 00000000..7fa6dfee --- /dev/null +++ b/evals/metrics/auto_eval/prompt_templates/factualness.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +class Factualness: + name = "factualness" + required_columns = ["answer", "context"] + template = """- Factualness: Factualness assesses how much of the provided answer is contained within the provided context. A higher score indicates that a higher proportion of claims present in the answer are present or can be derived from the provided context. + - Score 1: the answer is completely hallucinated i.e. not contained in the context at all or there is no answer. + - Score 2: only a small part of the answer is contained in the context but most of it is imaginary/hallucinated or the meaning is completely changed from what is represented in the context. + - Score 3: Only about half of the answer is contained in the context. Rest of the answer is hallucinated or imaginary. + - Score 4: Most of the claims in the answer can be inferred from the provided context with very little information that is not directly supported by the provided context. 
class OpeningPrompt:
    """Preamble of the judge prompt: role, task statement and grading rules.

    ``Prompt.load_prompt_template`` looks this class up under the key
    ``"opening_prompt"`` and places its ``template`` first, ahead of the
    per-metric rubrics, which is why the text ends by announcing the metric
    definitions that follow.
    """

    # Registry key; matches the module name and the entry in the package's ``__all__``.
    name = "opening_prompt"
    # The preamble itself needs no dataset columns.
    # NOTE(review): no code visible in this change reads ``required_columns`` - confirm intended use.
    required_columns = []

    # Plain text with no placeholders; it is concatenated into the final Jinja2 template.
    template = """Consider yourself as an helpful, truthful and impartial judge.

Your task:
You will be given an input consisting of a question, an answer and a context. Your task is to act as an impartial judge and provide a numerical score between 1 to 5 for each of the following metrics for the given answer.

Important rules for you while completing this task:
1. You MUST ALWAYS provide a score for every metric mentioned below.
2. Make sure to understand definition of every metric fully before completing your task. Every metric is provided with grading scale and rubric. You MUST use this grading scale and rubric to determine your score.
3. Ensure that your scores and reasoning for every metric is independent of each other e.g., score for factualness should not impact score for correctness and vice versa.
4. Base your grading decision only on the given inputs and do not speculate or hallucinate.
5. You must also provide reasoning for your score in a single sentence.

Your metric definitions along with grading scale and rubric:"""
class RAGDataset:
    """Load a RAG evaluation dataset and expose it as a Hugging Face ``Dataset``.

    Records are read either from a local JSON-lines file (``mode="local"``)
    or from a dataset fetched with ``datasets.load_dataset``
    (``mode="benchmarking"``). Each record is reduced to the fields named in
    ``field_map`` and renamed to the evaluator's own field names.

    Args:
        dataset: Path to a ``.jsonl`` file (local mode) or the dataset
            identifier passed to ``load_dataset`` (benchmarking mode).
        field_map: Mapping ``{output_field: input_field}``; for every record
            ``record[input_field]`` is stored under ``output_field``.
        mode: ``"local"`` or ``"benchmarking"``.
        split: Split read in benchmarking mode. Defaults to ``"train"``,
            which is what this class always used before the parameter existed.

    Raises:
        ValueError: If ``mode`` is not one of the supported values, or a
            converted example lacks one of the mapped output fields.
        FileNotFoundError: If ``mode`` is ``"local"`` and ``dataset`` does not exist.
        KeyError: If a record lacks one of the input fields named in ``field_map``.
    """

    _VALID_MODES = ("local", "benchmarking")

    def __init__(self, dataset, field_map, mode, split="train"):
        # Explicit check instead of ``assert`` so validation survives ``python -O``.
        if mode not in self._VALID_MODES:
            raise ValueError("mode can be either local or benchmarking")
        self.dataset = dataset
        self.field_map = field_map
        self.mode = mode
        self.split = split
        self.data = self.load_data()
        self.validate_dataset()

    @staticmethod
    def _convert_example(obj, field_map):
        """Project one raw record onto ``field_map``, joining list values with newlines."""
        example = {}
        for out_field, in_field in field_map.items():
            try:
                value = obj[in_field]
            except KeyError:
                # Same exception type as before, but the message now names the field.
                raise KeyError(
                    "Input record has no field {!r} (mapped to {!r})".format(in_field, out_field)
                ) from None
            # List-valued fields (e.g. several context passages) are flattened
            # into one newline-separated string.
            example[out_field] = "\n".join(value) if isinstance(value, list) else value
        return example

    def load_data(self):
        """Read the raw records for the configured mode and return them as a ``Dataset``."""
        if self.mode == "local":
            if not os.path.exists(self.dataset):
                raise FileNotFoundError("There is no such file - {}".format(self.dataset))
            with jsonlines.open(self.dataset) as reader:
                records = [self._convert_example(obj, self.field_map) for obj in reader]
        else:
            source = load_dataset(self.dataset)[self.split]
            records = [self._convert_example(obj, self.field_map) for obj in source]
        return Dataset.from_list(records)

    def validate_dataset(self):
        """Check that every loaded example carries every mapped output field."""
        for i, example in enumerate(self.data):
            for out_field in self.field_map:
                if out_field not in example:
                    # Examples are reported 1-based, as in the original message.
                    raise ValueError("Example {} does not have {} field".format(i + 1, out_field))

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        return iter(self.data)
class AutoEvaluate:
    """Grade RAG answers with an LLM judge.

    The constructor loads the dataset, builds the judge backend for the
    requested evaluation mode and assembles the prompt template for the
    requested metrics. ``measure`` then renders one prompt per example and
    collects the judge's raw text responses.

    Args:
        dataset: Dataset path or identifier, forwarded to ``RAGDataset``.
        data_mode: ``"local"`` or ``"benchmarking"``, forwarded to ``RAGDataset``.
        field_map: Mapping from evaluator field names to dataset field names.
            Its keys also become the placeholders of the prompt template.
        evaluation_mode: ``"openai"``, ``"endpoint"`` or ``"local"``.
        model_name: Model identifier, or the endpoint URL in endpoint mode.
        evaluation_metrics: List of metric names to include in the prompt.
        hf_token: Hugging Face token, used only in local mode.
        openai_key: OpenAI API key, used only in openai mode.
        debug_mode: When truthy, only the first example is evaluated.

    Raises:
        ValueError: If ``evaluation_mode`` is not a supported mode.
    """

    def __init__(
        self,
        dataset,
        data_mode,
        field_map,
        evaluation_mode,
        model_name,
        evaluation_metrics,
        hf_token=None,
        openai_key=None,
        debug_mode=None,
    ):
        # Per-backend generation keyword arguments passed to ``evaluator.generate``.
        self.GENERATION_CONFIG = {
            "openai": {"temperature": 0.1},
            "endpoint": {"max_tokens": 500},
            "local": {"max_new_tokens": 500},
        }
        # Fail fast, before the dataset or a model is loaded.
        if evaluation_mode not in self.GENERATION_CONFIG:
            raise ValueError("evaluation mode must be openai / endpoint / local")
        self.data = RAGDataset(dataset=dataset, field_map=field_map, mode=data_mode)
        self.evaluator = self.get_evaluator(evaluation_mode, model_name, openai_key, hf_token)
        self.prompt_template = self.get_template(evaluation_metrics, field_map)
        self.debug_mode = debug_mode
        self.generation_config = self.GENERATION_CONFIG[evaluation_mode]

    def get_evaluator(self, evaluation_mode, model_name, openai_key=None, hf_token=None):
        """Build the judge backend for ``evaluation_mode``.

        Raises:
            ValueError: If ``evaluation_mode`` is not openai / endpoint / local.
        """
        if evaluation_mode == "openai":
            # Never echo the API key: stdout typically ends up in logs.
            print("Using openai backend with model {}".format(model_name))
            return OAIEvaluator(openai_key, model_name)
        if evaluation_mode == "endpoint":
            print("Loading HF endpoint at {}".format(model_name))
            return EndpointEvaluator(model_name)
        if evaluation_mode == "local":
            print("Loading {} model locally".format(model_name))
            login(token=hf_token, add_to_git_credential=True)
            return HFEvaluator(model_name)
        raise ValueError("evaluation mode must be openai / endpoint / local")

    def get_template(self, evaluation_metrics, field_map):
        """Return the Jinja2 template built for the metrics and the ``field_map`` keys."""
        return Prompt(metrics=evaluation_metrics, input_fields=field_map).template

    def measure(self):
        """Query the judge once per example and return the raw responses.

        Returns:
            A list with one response string per evaluated example, or at most
            one entry when ``debug_mode`` is truthy.
        """
        total = len(self.data)
        n_samples = min(1, total) if self.debug_mode else total
        responses = []
        start = time.time()
        for i in range(n_samples):
            example = self.data[i]
            # The template placeholders are the ``field_map`` keys, which are
            # exactly the keys of each example. Passing the example through
            # keeps the two in sync; the former ``query=`` keyword left the
            # ``{{question}}`` placeholder empty.
            prompt = self.prompt_template.render(**example)
            messages = [{"role": "user", "content": prompt}]
            responses.append(self.evaluator.generate(messages, **self.generation_config))
        end = time.time()
        print("Generation of scores and reasoning took {:.2f} seconds for {:,} examples".format(end - start, n_samples))
        return responses
def extract_delay_from_rate_limit_error_msg(text, default_delay=5.0):
    """Parse the wait time announced in a rate-limit error message.

    Looks for ``"retry after <number>"`` (case-insensitive; the number may
    have a fractional part) and returns that number of seconds.

    Args:
        text: The error message to inspect.
        default_delay: Value returned when the message carries no hint.
            Defaults to 5 seconds, the fallback this helper has always used.

    Returns:
        The delay in seconds, always as a ``float``.
    """
    match = re.search(r"retry after (\d+(?:\.\d+)?)", text, flags=re.IGNORECASE)
    if match is None:
        return float(default_delay)
    return float(match.group(1))
class EndpointEvaluator:
    """Judge backend that talks to a chat-completions HTTP endpoint.

    Args:
        model_name: Base URL of the endpoint, e.g. ``http://host:port``.
            ``/v1/chat/completions`` is appended to it.
    """

    def __init__(self, model_name):
        self.client = InferenceClient(base_url="{}/v1/chat/completions".format(model_name))

    def generate(self, messages, **kwargs):
        """Send ``messages`` to the endpoint and return the complete reply text.

        The reply is requested as a stream and reassembled here.

        Args:
            messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
            **kwargs: Extra generation arguments forwarded to the client call.

        Returns:
            The concatenated text of all streamed chunks.
        """
        stream = self.client.chat.completions.create(
            model="tgi",
            messages=messages,
            stream=True,
            **kwargs,
        )
        pieces = []
        for chunk in stream:
            # A chunk may arrive without any choice; there is nothing to read from it.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                pieces.append(content)
        # Streamed deltas are fragments of one text and already contain their
        # own whitespace. Joining them with a space would split words and
        # numbers apart ("Sc ore").
        return "".join(pieces)
def retry_and_handle_exceptions(
    exception_to_check: Union[Exception, Tuple[Exception]],
    max_retries: int = 3,
    initial_delay: float = 1,
    exponential_base: float = 2,
    jitter: bool = False,
    extract_delay_from_error_message: Optional[any] = None,
):
    """Build a decorator that retries a function when it raises selected exceptions.

    Args:
        exception_to_check: Exception class, or tuple of classes, that
            triggers a retry. Any other exception propagates immediately.
        max_retries: Total number of attempts (not additional retries); must be >= 1.
        initial_delay: Base delay in seconds. It is multiplied by the backoff
            factor *before* the first wait, so the first wait is
            ``initial_delay * exponential_base``.
        exponential_base: Factor applied to the delay after every failure.
        jitter: When true, the factor is scaled by a random value in ``[1, 2)``.
        extract_delay_from_error_message: Optional callable taking the error
            message and returning a delay in seconds. A truthy result replaces
            the computed backoff delay, so a callback that always returns a
            positive number disables the exponential backoff.

    Returns:
        A decorator preserving the wrapped function's metadata.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1. Previously the
            wrapped function was then never called and ``None`` was returned.
        Exception: From the wrapped call, once the last attempt has failed;
            the triggering exception is attached as ``__cause__``.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1, got {0}".format(max_retries))

    def deco_retry(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            delay = initial_delay
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except exception_to_check as e:
                    if attempt == max_retries - 1:
                        # Keep the original error reachable for debugging.
                        raise Exception(
                            "Func execution failed after {0} retries: {1}".format(max_retries, e)
                        ) from e
                    delay *= exponential_base * (1 + jitter * random.random())
                    delay_from_error_message = None
                    if extract_delay_from_error_message is not None:
                        delay_from_error_message = extract_delay_from_error_message(str(e))
                    # A server-provided hint wins over the computed backoff.
                    final_delay = delay_from_error_message if delay_from_error_message else delay
                    print("Func execution failed. Retrying in {0} seconds: {1}".format(final_delay, e))
                    time.sleep(final_delay)

        return wrapper

    return deco_retry
unittest.main()