Auto (annotation-free) evaluation of RAG #140

Closed · wants to merge 39 commits

Commits
669997a
minimized required fields/columns in user data
adkakne Sep 19, 2024
80e2160
add bench-target as the prefix of output folder (#133)
daisy-ycguo Sep 19, 2024
eb98d2e
remove examples. (#135)
lkk12014402 Sep 19, 2024
ad58bd8
minor naming correction to maintain consistency
adkakne Sep 20, 2024
c49ea84
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 20, 2024
50d4167
Add hyperlinks and paths validation. (#132)
ZePan110 Sep 20, 2024
81151fd
Merge remote-tracking branch 'upstream/main'
adkakne Sep 20, 2024
bafc701
adding README for OPEA ragas
adkakne Sep 20, 2024
1cc5ffe
adding python3 syntax to README
adkakne Sep 20, 2024
08d6581
Merge remote-tracking branch 'upstream/main' into auto_eval
adkakne Sep 24, 2024
3390b5d
adding auto (annotation-free) evaluation - functionality
adkakne Sep 24, 2024
f998a00
auto-eval endpoint on gaudi - tested and working
adkakne Sep 24, 2024
7ffac2b
merging latest changes from upstream main
adkakne Sep 24, 2024
6bfadb2
updating testing environment
adkakne Sep 24, 2024
bd0d2af
adding unit test for auto eval - passing successfully
adkakne Sep 25, 2024
85b1ff9
editing parameters for online test environment
adkakne Sep 25, 2024
225f1a7
added working example with endpoint to README
adkakne Sep 25, 2024
3a67ff3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 25, 2024
9e2714e
added init file and changed import paths accordingly
adkakne Sep 25, 2024
2079a4d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 25, 2024
b4fa896
automatically setting template_dir param
adkakne Sep 25, 2024
f266c3c
moved auto_eval to metrics and generalized opening prompt
adkakne Sep 25, 2024
f134058
testing of auto-eval with Llama 3.2 successful
adkakne Sep 26, 2024
ccf74a7
Merge branch 'main' into auto_eval
lvliang-intel Sep 27, 2024
85ac47a
Merge branch 'main' into auto_eval
lvliang-intel Oct 7, 2024
3ca8ecb
removed .env loading and modularized metric templates
adkakne Oct 11, 2024
e9c087e
Merge branch 'main' into auto_eval
adkakne Oct 11, 2024
9157b83
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 11, 2024
e9c915a
merging requirements
adkakne Oct 11, 2024
b0424f0
Optimize path and link validity check. (#143)
ZePan110 Oct 8, 2024
d4c3391
Signed-off-by: Rowena Almeida <rowena.almeida.com> (#150)
rowenaal Oct 8, 2024
e321003
[Benchmark] Get benchmark reports. (#155)
Zhenzhong1 Oct 10, 2024
f1d2099
Support sharegpt dataset in chatqna e2e test (#152)
joshuayao Oct 10, 2024
ddd3607
Fix the issue of exiting due to inability to find hyperlinks. (#156)
ZePan110 Oct 11, 2024
e8a9850
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 11, 2024
1913b03
Merge branch 'auto_eval' of https://github.com/adkakne/GenAIEval into…
adkakne Oct 11, 2024
c332999
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 11, 2024
68a14a0
Removing python-dotenv from requirements
adkakne Oct 11, 2024
56ad626
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 11, 2024
1 change: 1 addition & 0 deletions evals/metrics/auto_eval/.env
@@ -0,0 +1 @@
OPENAI_KEY=xxx
57 changes: 57 additions & 0 deletions evals/metrics/auto_eval/README.md
@@ -0,0 +1,57 @@
# Auto (annotation-free) Evaluation of Retrieval Augmented Generation

We provide an easy-to-use, flexible, and annotation-free RAG evaluation tool that uses LLM-as-a-judge while benefiting from Intel's Gaudi2 AI accelerator chips.

## Overview
### Data
AutoEval is best suited for Long-Form Question Answering (LFQA) datasets, where you want to gauge the quality and factualness of the answer via an LLM's judgment. You can use benchmarking datasets or bring your own custom dataset. Make sure to set `field_map` to map AutoEval fields such as "question" to your dataset's corresponding field, e.g. "query".
> Note: To use benchmarking datasets, set `data_mode=benchmarking`. Similarly, to use custom datasets, set `data_mode=local`.
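As a sketch of the custom-dataset (`data_mode=local`) case, suppose each line of your JSONL file uses its own field names; `field_map` renames them to AutoEval's fields (the file name and field names below are illustrative assumptions, not part of this PR):

```python
# Hypothetical record in my_rag_outputs.jsonl:
# {"query": "...", "rag_answer": "...", "retrieved_context": ["...", "..."]}
field_map = {
    "question": "query",             # AutoEval field -> your dataset's field
    "answer": "rag_answer",
    "context": "retrieved_context",  # list-valued fields are joined with newlines
}
```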
### Model
AutoEval can run in 3 evaluation modes:
1. `evaluation_mode="endpoint"` uses a Hugging Face endpoint.
- We recommend launching the endpoint on Gaudi AI accelerator machines to ensure maximum usage and performance.
- To launch an HF endpoint on Gaudi2, follow the 2-step instructions here: [tgi-gaudi](https://github.com/huggingface/tgi-gaudi).
- Pass your endpoint URL as the `model_name` argument.
2. `evaluation_mode="openai"` uses the OpenAI backend.
- Set your `openai_key` and pass your choice of model as the `model_name` argument.
3. `evaluation_mode="local"` uses your local hardware.
- Set the `hf_token` argument and set your favorite open-source model in the `model_name` argument.
- GPU usage is prioritized after checking its availability; if no GPU is available, the model runs on the CPU.
## Metrics
AutoEval provides 4 metrics: factualness, correctness, relevance and readability. You can also bring your own metrics and grading scales; just remember to add your metric to the `evaluation_metrics` argument.
## Generation configuration
Set generation parameters to your requirements via `GENERATION_CONFIG` in `run_eval.py`.
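The actual keys live in `GENERATION_CONFIG` inside `run_eval.py`; as an illustration only (these parameter names are common text-generation arguments, not confirmed contents of that file), such a dictionary might look like:

```python
# Illustrative values only -- tune per your judge model and desired verbosity
GENERATION_CONFIG = {
    "max_new_tokens": 512,  # room for per-metric reasoning and scores
    "temperature": 0.1,     # near-deterministic grading
    "top_p": 0.9,
}
```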

## Run using HF endpoint
```python
import os

from evals.metrics.auto_eval import AutoEvaluate

dataset = "explodinggradients/ragas-wikiqa"
data_mode = "benchmarking"
field_map = {"question": "question", "answer": "generated_with_rag", "context": "context"}

template_dir = "auto_eval_metrics"

evaluation_mode = "endpoint"

host_ip = os.getenv("host_ip", "localhost")
port = os.getenv("port", "<add your port where your endpoint is running>")
model_name = f"http://{host_ip}:{port}"

evaluation_metrics = ["factualness", "relevance", "correctness", "readability"]

evaluator = AutoEvaluate(
    dataset=dataset,
    data_mode=data_mode,
    field_map=field_map,
    template_dir=template_dir,
    evaluation_mode=evaluation_mode,
    model_name=model_name,
    evaluation_metrics=evaluation_metrics,
    debug_mode=True,
)

responses = evaluator.measure()

for response in responses:
    print(response)
```
That's it! For troubleshooting, please submit an issue and we will get right on it.
10 changes: 10 additions & 0 deletions evals/metrics/auto_eval/__init__.py
@@ -0,0 +1,10 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


from .run_eval import AutoEvaluate

__all__ = ["AutoEvaluate"]
77 changes: 77 additions & 0 deletions evals/metrics/auto_eval/prompt_engineering.py
@@ -0,0 +1,77 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from jinja2 import Template

from .prompt_templates import *
from .prompt_templates import NAME2METRIC


class Prompt:
    """Class to customize a prompt template using a user-defined list of metrics."""

    def __init__(self, metrics, input_fields):
        self.metrics = metrics
        self.input_fields = input_fields
        self.template = self.load_prompt_template()

    def create_grading_format(self):
        grading_format = (
            "You must ALWAYS provide every single one of the scores and reasonings in the following JSON format:"
        )
        grading_format += "\n" + "{" + "\n"
        content = []
        reasoning_prompt = "Reasoning for {}: [your one line step by step reasoning about the {} of the answer]"
        scoring_prompt = "Score for {}: [your score number for the {} of the answer]"
        for metric in self.metrics:
            reasoning = reasoning_prompt.format(metric, metric)
            score = scoring_prompt.format(metric, metric)
            content.append(reasoning + "\n" + score)
        grading_format += "\n\n".join(content)
        grading_format += "\n" + "}"
        return grading_format

    def create_closing_prompt(self):
        closing_prompt = ["Let's begin!"]
        for f in self.input_fields:
            closing_prompt.append("Provided {}:".format(f) + "\n" + "{{" + f + "}}")
        return "\n\n".join(closing_prompt)

    def load_prompt_template(self):
        content = []
        for metric_name in ["opening_prompt"] + self.metrics:
            metric_instance = NAME2METRIC[metric_name]
            content.append(metric_instance.template)
        content.append(self.create_grading_format())
        content.append(self.create_closing_prompt())
        return Template("\n\n".join(content))

    def render_prompt(self, **kwargs) -> str:
        return self.template.render(**kwargs)


if __name__ == "__main__":

    # Here, we test the implementation of the Prompt class.

    # step 0 - user input
    metrics = ["factualness", "relevance", "correctness", "readability"]
    input_fields = ["question", "answer", "context"]

    # step 1 - load prompt using Prompt class
    prompt = Prompt(metrics=metrics, input_fields=input_fields)

    example = {
        "question": "Who is the wife of Barack Obama?",
        "context": "Michelle Obama, wife of Barack Obama (former President of the United States of America) is an attorney. Barack and Michelle Obama have 2 daughters - Malia and Sasha",
        "answer": "Michelle Obama",
        "ground_truth": "The wife of Barack Obama is Michelle Obama",
    }

    # step 2 - render prompt with given inputs
    rendered_prompt = prompt.render_prompt(
        question=example["question"], answer=example["answer"], context=example["context"]
    )

    print(rendered_prompt)
21 changes: 21 additions & 0 deletions evals/metrics/auto_eval/prompt_templates/__init__.py
@@ -0,0 +1,21 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from .opening_prompt import OpeningPrompt

from .correctness import Correctness
from .factualness import Factualness
from .relevance import Relevance
from .readability import Readability

__all__ = ["opening_prompt", "correctness", "factualness", "relevance", "readability"]

NAME2METRIC = {}


def snake2camel(s):
    return "".join(x.capitalize() or "_" for x in s.split("_"))


for name in __all__:
    NAME2METRIC[name] = eval(snake2camel(name))
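For reference, a standalone copy of the `snake2camel` helper shows how the lowercase metric names in `__all__` resolve to their class names:

```python
def snake2camel(s: str) -> str:
    # "opening_prompt" -> "OpeningPrompt"; empty parts map to "_"
    return "".join(x.capitalize() or "_" for x in s.split("_"))

print(snake2camel("opening_prompt"))  # -> OpeningPrompt
```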
13 changes: 13 additions & 0 deletions evals/metrics/auto_eval/prompt_templates/correctness.py
@@ -0,0 +1,13 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


class Correctness:
    name = "correctness"
    required_columns = ["answer", "context", "question"]
    template = """- Correctness: Correctness measures how accurately and comprehensively the answer resolves the problem posed in the question.
- Score 1: If the answer is an empty string or something like "I do not know the answer", the correctness score is 1.
- Score 2: If the answer only addresses a small part of the question correctly, or is missing many critical steps/aspects, or is too short to fully address the problem described in the question, then the correctness score is 2.
- Score 3: The answer mostly addresses the question but one critical aspect/step is missing or incorrect.
- Score 4: The answer mostly answers the question and covers all critical/main aspects of the question, but it is missing important/necessary details about one or more aspects.
- Score 5: The answer correctly and completely addresses the query. It also covers important details about each step."""
13 changes: 13 additions & 0 deletions evals/metrics/auto_eval/prompt_templates/factualness.py
@@ -0,0 +1,13 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


class Factualness:
    name = "factualness"
    required_columns = ["answer", "context"]
    template = """- Factualness: Factualness assesses how much of the provided answer is contained within the provided context. A higher score indicates that a higher proportion of the claims present in the answer are present in, or can be derived from, the provided context.
- Score 1: The answer is completely hallucinated, i.e. not contained in the context at all, or there is no answer.
- Score 2: Only a small part of the answer is contained in the context; most of it is imaginary/hallucinated, or the meaning is completely changed from what is represented in the context.
- Score 3: Only about half of the answer is contained in the context; the rest is hallucinated or imaginary.
- Score 4: Most of the claims in the answer can be inferred from the provided context, with very little information that is not directly supported by it.
- Score 5: All of the claims in the answer are directly supported by the provided context, demonstrating high faithfulness to it."""
21 changes: 21 additions & 0 deletions evals/metrics/auto_eval/prompt_templates/opening_prompt.py
@@ -0,0 +1,21 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


class OpeningPrompt:
    name = "opening_prompt"
    required_columns = []

    template = """Consider yourself as a helpful, truthful and impartial judge.

Your task:
You will be given an input consisting of a question, an answer and a context. Your task is to act as an impartial judge and provide a numerical score between 1 and 5 for each of the following metrics for the given answer.

Important rules for you while completing this task:
1. You MUST ALWAYS provide a score for every metric mentioned below.
2. Make sure to understand the definition of every metric fully before completing your task. Every metric is provided with a grading scale and rubric. You MUST use this grading scale and rubric to determine your score.
3. Ensure that your scores and reasoning for every metric are independent of each other, e.g., the score for factualness should not impact the score for correctness and vice versa.
4. Base your grading decision only on the given inputs; do not speculate or hallucinate.
5. You must also provide the reasoning for your score in a single sentence.

Your metric definitions along with grading scale and rubric:"""
13 changes: 13 additions & 0 deletions evals/metrics/auto_eval/prompt_templates/readability.py
@@ -0,0 +1,13 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


class Readability:
    name = "readability"
    required_columns = ["answer"]
    template = """- Readability: Readability measures the clarity and lucidity of the answer. Readability is measured solely based on the answer; it does not consider the question or the context.
- Score 1: The answer is empty, or is "I do not know the answer", or is completely unreadable, or no meaningful information can be extracted from it.
- Score 2: The answer is only slightly readable; there are irrelevant symbols, HTML tags or repeated words, but it can roughly form a meaningful sentence that covers some aspects of the answer.
- Score 3: The answer can be read but contains grammatical mistakes.
- Score 4: The answer is readable, but its readability and style could be improved to better appeal to the reader.
- Score 5: The answer is reader-friendly and well written."""
13 changes: 13 additions & 0 deletions evals/metrics/auto_eval/prompt_templates/relevance.py
@@ -0,0 +1,13 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


class Relevance:
    name = "relevance"
    required_columns = ["question", "answer"]
    template = """- Relevance: Relevance measures how well the answer relates to the question.
- Score 1: The answer doesn't mention anything about the question or is completely irrelevant to it.
- Score 2: The answer only identifies the domain (e.g. cnvrg) mentioned in the question and provides information from the correct domain, but it does not address the question itself and completely misses its point.
- Score 3: The answer correctly identifies the domain and essence of the question, but the details in the answer are not relevant to the focus of the question.
- Score 4: The answer correctly identifies the domain mentioned in the question and the essence of the question, and stays consistent with both, but some part of the answer is not relevant to the question, its topic or its essence; this irrelevant part damages the overall relevance of the answer.
- Score 5: The answer is completely relevant to the question and the details do not deviate from its essence. No parts of the answer are irrelevant or unnecessary for the given question."""
86 changes: 86 additions & 0 deletions evals/metrics/auto_eval/rag_dataset.py
@@ -0,0 +1,86 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

import jsonlines
from datasets import Dataset, load_dataset


class RAGDataset:
    """Dataset class to store data in HF datasets API format."""

    def __init__(self, dataset, field_map, mode):
        self.dataset = dataset
        self.field_map = field_map
        assert mode in ["local", "benchmarking"], "mode can be either local or benchmarking"
        self.mode = mode
        self.data = self.load_data()
        self.validate_dataset()

    def load_data(self):
        if self.mode == "local":
            assert os.path.exists(self.dataset), "There is no such file - {}".format(self.dataset)
            with jsonlines.open(self.dataset) as reader:
                data = []
                for obj in reader:
                    ex = {}
                    for out_field, in_field in self.field_map.items():
                        if isinstance(obj[in_field], list):
                            ex[out_field] = "\n".join(obj[in_field])
                        else:
                            ex[out_field] = obj[in_field]
                    data.append(ex)
            return Dataset.from_list(data)
        else:
            data = []
            for obj in load_dataset(self.dataset)["train"]:
                ex = {}
                for out_field, in_field in self.field_map.items():
                    if isinstance(obj[in_field], list):
                        ex[out_field] = "\n".join(obj[in_field])
                    else:
                        ex[out_field] = obj[in_field]
                data.append(ex)
            return Dataset.from_list(data)

    def validate_dataset(self):
        for i, example in enumerate(self.data):
            for out_field in self.field_map:
                assert out_field in example, "Example {} does not have {} field".format(i + 1, out_field)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        return iter(self.data)


if __name__ == "__main__":

    dataset_path = "../../benchmark/ragas/ground_truth.jsonl"
    field_map = {
        "question": "question",
        "ground_truth": "ground_truth",
        "context": "context",
    }

    ds = RAGDataset(dataset=dataset_path, field_map=field_map, mode="local")

    for i, ex in enumerate(ds):
        assert ex["question"] == ds[i]["question"], "index {} does not have correct query".format(i)

    dataset = "explodinggradients/ragas-wikiqa"
    field_map = {
        "question": "question",
        "answer": "generated_with_rag",
        "context": "context",
        "ground_truth": "correct_answer",
    }
    ds = RAGDataset(dataset=dataset, field_map=field_map, mode="benchmarking")

    for i, ex in enumerate(ds):
        assert ex["question"] == ds[i]["question"], "index {} does not have correct query".format(i)
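The core of `load_data` above is the field-map normalization step: rename each record's fields per `field_map` and join list-valued fields with newlines. A dependency-free sketch of just that step (the `normalize` helper and sample data are illustrative, not part of this PR):

```python
def normalize(record: dict, field_map: dict) -> dict:
    """Rename fields per field_map; join list-valued fields with newlines."""
    ex = {}
    for out_field, in_field in field_map.items():
        value = record[in_field]
        ex[out_field] = "\n".join(value) if isinstance(value, list) else value
    return ex

raw = {"query": "Who wrote Hamlet?", "ctx": ["Shakespeare wrote Hamlet.", "It is a tragedy."]}
print(normalize(raw, {"question": "query", "context": "ctx"}))
```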