diff --git a/notebooks/__init__.py b/notebooks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/notebooks/evaluation/rag_experiments.py b/notebooks/evaluation/rag_experiments.py new file mode 100644 index 000000000..d64f7fe4e --- /dev/null +++ b/notebooks/evaluation/rag_experiments.py @@ -0,0 +1,510 @@ +import json +import sys +from dataclasses import asdict +from operator import itemgetter +from pathlib import Path +from typing import Annotated + +import click +import jsonlines +import pandas as pd +import seaborn as sns +from deepeval import evaluate +from deepeval.dataset import EvaluationDataset +from deepeval.metrics import ( + AnswerRelevancyMetric, + ContextualPrecisionMetric, + ContextualRecallMetric, + ContextualRelevancyMetric, + FaithfulnessMetric, + HallucinationMetric, +) +from dotenv import find_dotenv, load_dotenv +from elasticsearch import Elasticsearch +from elasticsearch.helpers import bulk, scan +from fastapi import Depends +from langchain.globals import set_verbose +from langchain.schema import StrOutputParser +from langchain_community.chat_models import ChatLiteLLM +from langchain_core.retrievers import BaseRetriever +from langchain_core.runnables import ConfigurableField, Runnable, RunnableLambda, RunnablePassthrough +from langchain_core.vectorstores import VectorStoreRetriever +from scipy import stats +from tiktoken import Encoding + +# Temp hack - there is an issue with the importing of redbox-core +sys.path.append(str(Path(__file__).parents[2])) + +from core_api.src import dependencies +from core_api.src.dependencies import get_tokeniser +from core_api.src.format import format_documents +from core_api.src.retriever import ParameterisedElasticsearchRetriever +from core_api.src.runnables import make_chat_prompt_from_messages_runnable +from redbox.models import ChatRoute, Settings +from redbox.models.chain import ChainInput +from redbox.models.file import UUID +from redbox.models.settings import ElasticLocalSettings + +set_verbose(False) + + +_ = load_dotenv(find_dotenv()) + + +class GetExperimentResults: + """ + Class to handle experiment results retrieval and processing. + + Attributes: + data_version (str): Data version to be used in experiment + benchmark (bool): Benchmark stated or not + V_EMBEDDINGS (str): Embeddings version + V_ROOT (path): Path to where the root of experiment data lies + V_SYNTHETIC (path): Path to the synthetic data + V_RESULTS (path): Path to the results from data processing + MODEL (str): Embedding model + ES_CLIENT (func): Elastic search client from settings + INDEX (str): Index for the json lines file + experiment_name (str): Name of the experiment + """ + + def __init__(self): + self.data_version = None + self.benchmark = None + self.V_EMBEDDINGS = "" + self.V_ROOT = None + self.V_SYNTHETIC = None + self.V_RESULTS = None + self.MODEL = None + self.ES_CLIENT = None + self.INDEX = None + self.experiment_name = None + self.retrieval_system_prompt = None + self.retrieval_question_prompt = None + self.eval_results = None + self.ENV = Settings(minio_host="localhost", elastic=ElasticLocalSettings(host="localhost")) + self.LLM = ChatLiteLLM( + model="gpt-4o", + streaming=True, + ) + self.FILE_UUIDS = None + self.USER_UUID = UUID("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa") + self.experiment_file_name = None + self.experiment_parameters = None + + def set_data_version(self, data_version): + """ + This function sets the necessary environment variables depending on your data version. 
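For orientation, a minimal sketch of the folder layout and Elasticsearch index naming that `set_data_version` derives from a data version. The data version and embedding model name below are assumed example values, not taken from this patch; in the real script they come from the `--data_version` CLI option and `Settings().embedding_model`.

```python
from pathlib import Path

# Assumed example values (hypothetical), for illustration only.
data_version = "0.2.0"
embedding_model = "all-mpnet-base-v2"

evaluation_dir = Path("notebooks/evaluation")
v_root = evaluation_dir / f"data/{data_version}"
paths = {
    "synthetic": v_root / "synthetic",    # QA sets, e.g. ragas_synthetic_data.csv
    "results": v_root / "results",        # evaluation outputs and plots
    "embeddings": v_root / "embeddings",  # pre-computed chunk embeddings (<model>.jsonl)
}

# The index name combines the data version and embedding model, lower-cased.
index_name = f"{data_version}-{embedding_model}".lower()
print(index_name)  # 0.2.0-all-mpnet-base-v2
```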
+ It assumes you have a versioned evaluation folder in your repository e.g. notebooks/evaluation/data/0.2.0 + This should be copied from the Redbox shared Google Drive. + This folder contains the raw files, QA sets, embeddings etc. + """ + self.data_version = data_version + root = Path(__file__).parents[2] + evaluation_dir = root / "notebooks/evaluation" + self.V_ROOT = evaluation_dir / f"data/{self.data_version}" + self.V_SYNTHETIC = self.V_ROOT / "synthetic" + self.V_RESULTS = self.V_ROOT / "results" + self.V_EMBEDDINGS = self.V_ROOT / "embeddings" + self.MODEL = self.ENV.embedding_model + self.INDEX = f"{self.data_version}-{self.MODEL}".lower() + self.ES_CLIENT = self.ENV.elasticsearch_client() + + def load_chunks_from_jsonl_to_index(self) -> set: + """ + This function takes the versioned embeddings (e.g. from notebooks/evaluation/data/0.2.0/embeddings) + and loads them to ElasticSearch. + """ + file_uuids = set() + file_path = self.V_EMBEDDINGS / f"{self.MODEL}.jsonl" + + with jsonlines.open(file_path, mode="r") as reader: + for chunk_raw in reader: + chunk = json.loads(chunk_raw) + self.ES_CLIENT.index( + index=self.INDEX, + id=chunk["uuid"], + body=chunk, + ) + + file_uuids.add(chunk["parent_file_uuid"]) + self.FILE_UUIDS = file_uuids + return file_uuids + + def clear_index(self) -> None: + """ + This function clears the indexes from ElasticSearch. + """ + if not self.ES_CLIENT.indices.exists(index=self.INDEX): + return None + + documents = list(scan(self.ES_CLIENT, index=self.INDEX, query={"query": {"match_all": {}}})) + bulk_data = [{"_op_type": "delete", "_index": doc["_index"], "_id": doc["_id"]} for doc in documents] + if bulk_data: + return bulk(self.ES_CLIENT, bulk_data, request_timeout=300) + return None + + def get_parameterised_retriever( + self, es: Annotated[Elasticsearch, Depends(dependencies.get_elasticsearch_client)] + ) -> BaseRetriever: + """ + Creates an Elasticsearch retriever runnable. + Runnable takes input of a dict keyed to question, file_uuids and user_uuid. + Runnable returns a list of Chunks. + """ + default_params = { + "size": self.ENV.ai.rag_k, + "num_candidates": self.ENV.ai.rag_num_candidates, + "match_boost": 1, + "knn_boost": 1, + "similarity_threshold": 0, + } + + return ParameterisedElasticsearchRetriever( + es_client=es, + index_name=self.INDEX, + params=default_params, + embedding_model=dependencies.get_embedding_model(self.ENV), + ).configurable_fields( + params=ConfigurableField( + id="params", + name="Retriever parameters", + description="A dictionary of parameters to use for the retriever.", + ) + ) + + def build_retrieval_chain( + self, + llm: Annotated[ChatLiteLLM, Depends(dependencies.get_llm)], + retriever: Annotated[VectorStoreRetriever, Depends(dependencies.get_parameterised_retriever)], + tokeniser: Annotated[Encoding, Depends(dependencies.get_tokeniser)], + env: Annotated[Settings, Depends(dependencies.get_env)], + ) -> Runnable: + """ + This is an adaptation of core_api.src.build_chains.build_retrieval_chain. + Function experiements with different retrieval_system_prompt and retrieval_question_prompt. 
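Because the retriever exposes its `params` as a `ConfigurableField` with id "params", an experiment can override retrieval settings per invocation instead of rebuilding the chain. A toy, self-contained sketch of that mechanism follows; `FakeParamsRetriever` is an invented stand-in, not the redbox `ParameterisedElasticsearchRetriever`, and it needs no running Elasticsearch.

```python
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from langchain_core.runnables import ConfigurableField


class FakeParamsRetriever(BaseRetriever):
    """Invented stand-in used only to illustrate the configurable-field pattern."""

    params: dict

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> list[Document]:
        # Return as many fake chunks as params["size"] asks for.
        return [Document(page_content=f"chunk {i} for {query!r}") for i in range(self.params["size"])]


retriever = FakeParamsRetriever(params={"size": 2}).configurable_fields(
    params=ConfigurableField(id="params", name="Retriever parameters")
)

print(len(retriever.invoke("test question")))  # 2, using the default params
print(len(retriever.invoke("test question", config={"configurable": {"params": {"size": 5}}})))  # 5
```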
+ """ + return ( + RunnablePassthrough.assign(documents=retriever) + | RunnablePassthrough.assign( + formatted_documents=(RunnablePassthrough() | itemgetter("documents") | format_documents) + ) + | { + "response": make_chat_prompt_from_messages_runnable( + system_prompt=str(self.retrieval_system_prompt), + question_prompt=str(self.retrieval_question_prompt), + input_token_budget=env.ai.context_window_size - env.llm_max_tokens, + tokeniser=tokeniser, + ) + | llm + | StrOutputParser(), + "source_documents": itemgetter("documents"), + "route_name": RunnableLambda(lambda _: ChatRoute.search.value), + } + ) + + def get_rag_results( + self, + question, + ) -> dict: + """ + Get Redbox response for a given question. + """ + retriever = self.get_parameterised_retriever(es=self.ES_CLIENT) + + chain = self.build_retrieval_chain(llm=self.LLM, retriever=retriever, tokeniser=get_tokeniser(), env=self.ENV) + + response = chain.invoke( + input=ChainInput( + question=question, + chat_history=[{"text": "", "role": "user"}], + file_uuids=list(self.FILE_UUIDS), + user_uuid=self.USER_UUID, + ).model_dump() + ) + + filtered_chunks = [] + + for chunk in response["source_documents"]: + dict_chunk = dict(chunk) + filtered_chunk = { + "page_content": dict_chunk["page_content"], + "page_number": dict_chunk["metadata"]["page_number"], + "parent_file_uuid": dict_chunk["metadata"]["parent_file_uuid"], + } + filtered_chunks.append(filtered_chunk) + + return {"output_text": response["response"], "source_documents": filtered_chunks} + + def write_rag_results(self) -> None: + """ + Format and write Redbox responses to evaluation dataset. + """ + + synthetic_df = pd.read_csv(f"{self.V_SYNTHETIC}/ragas_synthetic_data.csv") + inputs = synthetic_df["input"].tolist() + + df_function = synthetic_df.copy() + + actual_output = [] + retrieval_context = [] + + for question in inputs: + data = self.get_rag_results(question=question) + actual_output.append(data["output_text"]) + retrieval_context.append(data["source_documents"]) + + df_function["actual_output"] = actual_output + df_function["retrieval_context"] = retrieval_context + + df_function_clean = df_function.dropna() + df_function_clean.to_csv( + f"{self.V_SYNTHETIC}/{self.experiment_name}_complete_ragas_synthetic_data.csv", index=False + ) + + def do_evaluation(self) -> None: + """ + Calculate evaluation metrics for a synthetic RAGAS dataset, aggregate results + and write as CSV. 
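A hedged, one-row illustration of the `<experiment>_complete_ragas_synthetic_data.csv` shape this evaluation step consumes. The column names mirror the `add_test_cases_from_csv_file` call that follows; the text values and output file name are invented.

```python
import pandas as pd

row = {
    "input": "What does the policy say about remote working?",                  # synthetic question
    "expected_output": "Staff may work remotely up to three days a week.",      # ground-truth answer
    # context and retrieval_context are ';'-delimited lists of chunks, matching the
    # context_col_delimiter / retrieval_context_col_delimiter arguments used below.
    "context": "chunk about remote working;chunk about approval process",
    "actual_output": "The policy allows up to three days of remote work per week.",  # Redbox answer
    "retrieval_context": "retrieved chunk 1;retrieved chunk 2",
}
pd.DataFrame([row]).to_csv("example_complete_ragas_synthetic_data.csv", index=False)
```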
+ """ + + dataset = EvaluationDataset() + dataset.add_test_cases_from_csv_file( + file_path=f"{self.V_SYNTHETIC}/{self.experiment_name}_complete_ragas_synthetic_data.csv", + input_col_name="input", + actual_output_col_name="actual_output", + expected_output_col_name="expected_output", + context_col_name="context", + context_col_delimiter=";", + retrieval_context_col_name="retrieval_context", + retrieval_context_col_delimiter=";", + ) + + # Instantiate retrieval metrics + contextual_precision = ContextualPrecisionMetric( + threshold=0.5, # default is 0.5 + model="gpt-4o", + include_reason=True, + ) + + contextual_recall = ContextualRecallMetric( + threshold=0.5, # default is 0.5 + model="gpt-4o", + include_reason=True, + ) + + contextual_relevancy = ContextualRelevancyMetric( + threshold=0.5, # default is 0.5 + model="gpt-4o", + include_reason=True, + ) + + # Instantiate generation metrics + answer_relevancy = AnswerRelevancyMetric( + threshold=0.5, # default is 0.5 + model="gpt-4o", + include_reason=True, + ) + + faithfulness = FaithfulnessMetric( + threshold=0.5, # default is 0.5 + model="gpt-4o", + include_reason=True, + ) + + hallucination = HallucinationMetric( + threshold=0.5, # default is 0.5 + model="gpt-4o", + include_reason=True, + ) + + self.eval_results = evaluate( + test_cases=dataset, + metrics=[ + contextual_precision, + contextual_recall, + contextual_relevancy, + answer_relevancy, + faithfulness, + hallucination, + ], + ) + + return self.eval_results + + def write_evaluation_results(self) -> None: + """ + This function writes the evaluation results to a csv, identifiable by experiment_name. + """ + metric_type = { + "metric_name": [ + "Contextual Precision", + "Contextual Recall", + "Contextual Relevancy", + "Answer Relevancy", + "Faithfulness", + "Hallucination", + ], + "metric_type": ["retrieval", "retrieval", "retrieval", "generation", "generation", "generation"], + } + + evaluation = ( + pd.DataFrame.from_records(asdict(result) for result in self.eval_results) + .explode("metrics_metadata") + .reset_index(drop=True) + .assign( + metric_name=lambda df: df.metrics_metadata.apply(getattr, args=["metric"]), + score=lambda df: df.metrics_metadata.apply(getattr, args=["score"]), + reason=lambda df: df.metrics_metadata.apply(getattr, args=["reason"]), + ) + .merge(pd.DataFrame(metric_type), on="metric_name") + .drop(columns=["success", "metrics_metadata"]) + ) + + evaluation.to_csv(f"{self.V_RESULTS}/{self.experiment_name}_val_results.csv", index=False) + evaluation.head() + + def load_experiment_param_data( + self, + experiment_file_name=None, + benchmark=None, + ): + """ + This function loads an csv of experiments to try unless benchmark is specified; + in this case it will take the core_api retrieval_system_prompt and retrieval_question_prompt. + """ + + if benchmark: + self.benchmark = benchmark + self.experiment_file_name = "benchmark" + benchmark_df = pd.DataFrame() + benchmark_df["experiment_name"] = ["benchmark"] + benchmark_df["retrieval_system_prompt"] = [self.ENV.ai.retrieval_system_prompt] + benchmark_df["retrieval_question_prompt"] = [self.ENV.ai.retrieval_question_prompt] + self.experiment_parameters = benchmark_df + else: + self.experiment_file_name = experiment_file_name + self.experiment_parameters = pd.read_csv( + f"notebooks/evaluation/data/experiment_parameters/{self.experiment_file_name}.csv" + ) + + def loop_through_experiements(self): + """ + This function calls the other functions to run and write the different experiments. 
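For reference, a hypothetical experiment-parameters CSV of the kind `load_experiment_param_data` reads from `notebooks/evaluation/data/experiment_parameters/`. The file name, experiment names, and prompt wording are invented; only the three columns shown are required, and each row drives one pass of the experiment loop below.

```python
from pathlib import Path

# Hypothetical file, e.g. notebooks/evaluation/data/experiment_parameters/prompt_sweep.csv
csv_text = """experiment_name,retrieval_system_prompt,retrieval_question_prompt
terse_prompts,"You are a concise assistant. Answer strictly from the provided documents.","Answer the question using only the context above."
verbose_prompts,"You are a helpful assistant. Explain your answer and cite the documents used.","Answer the question using only the context above, and explain your reasoning."
"""
Path("prompt_sweep.csv").write_text(csv_text)
```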
+        """
+        for _index, row in self.experiment_parameters.iterrows():
+            self.experiment_name = row["experiment_name"]
+            self.retrieval_system_prompt = row["retrieval_system_prompt"]
+            self.retrieval_question_prompt = row["retrieval_question_prompt"]
+
+            self.write_rag_results()
+            self.do_evaluation()
+            self.write_evaluation_results()
+
+    def empirical_ci(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Calculate confidence intervals for aggregated metrics.
+        """
+
+        df_grouped = (
+            df.groupby(["experiment_name", "metric_name"])["score"]
+            .agg(["mean", "sem", "min", "max", "count"])
+            .reset_index()
+        )
+
+        ci = stats.t.interval(
+            confidence=0.95, df=df_grouped["count"] - 1, loc=df_grouped["mean"], scale=df_grouped["sem"]
+        )
+
+        df_grouped["ci_low"] = ci[0]
+        df_grouped["ci_high"] = ci[1]
+
+        return df_grouped
+
+    def create_visualisation_plus_grouped_results(self):
+        """
+        This function uses the stored experiment results to save the aggregated metrics using empirical_ci().
+        It also saves a barplot (here the confidence intervals are calculated by bootstrapping).
+        """
+        experiments = []
+        experiment_names = self.experiment_parameters["experiment_name"]
+        for experiment_name in experiment_names:
+            experiment = pd.read_csv(f"{self.V_RESULTS}/{experiment_name}_val_results.csv")
+            experiment["experiment_name"] = experiment_name
+            experiments.append(experiment)
+
+        experiments_df = pd.concat(experiments)
+
+        barplot = sns.barplot(experiments_df, x="score", y="metric_name", hue="experiment_name", errorbar=("ci", 95))
+        fig = barplot.get_figure()
+        fig.savefig(f"{self.V_RESULTS}/{self.experiment_file_name}_barplot.png", bbox_inches="tight")
+
+        experiment_metrics = self.empirical_ci(experiments_df)
+        experiment_metrics.to_csv(f"{self.V_RESULTS}/{self.experiment_file_name}_eval_results_full.csv")
+
+
+class Mutex(click.Option):
+    def __init__(self, *args, **kwargs):
+        self.not_required_if = kwargs.pop("not_required_if")
+        self.required_if_not_set = kwargs.pop("required_if_not_set", True)
+
+        if not self.not_required_if:
+            msg = "'not_required_if' parameter required"
+            raise ValueError(msg)
+
+        kwargs["help"] = (
+            kwargs.get("help", "") + " Option is mutually exclusive with " + ", ".join(self.not_required_if) + "."
+        ).strip()
+        super().__init__(*args, **kwargs)
+
+    def handle_parse_result(self, ctx, opts, args):
+        current_opt = self.name in opts
+        for mutex_opt in self.not_required_if:
+            if mutex_opt in opts:
+                if current_opt:
+                    msg = f"Illegal usage: '{self.name}' is mutually exclusive with {mutex_opt}."
+                    raise click.UsageError(msg)
+                self.prompt = None
+
+        if not current_opt and self.required_if_not_set and not any(opt in opts for opt in self.not_required_if):
+            msg = f"Illegal usage: Either '{self.name}' or one of {', '.join(self.not_required_if)} must be provided."
+            raise click.UsageError(msg)
+        return super().handle_parse_result(ctx, opts, args)
+
+
+@click.command()
+@click.option(
+    "--data_version",
+    required=True,
+    type=str,
+    help="Specify the data version you want to use.",
+)
+@click.option(
+    "--experiment_file_name",
+    cls=Mutex,
+    not_required_if=["benchmark"],
+    required_if_not_set=True,
+    type=str,
+    help="Specify the experiment data file name you want to use. 
(CSV)", +) +@click.option( + "--benchmark", + "-b", + cls=Mutex, + not_required_if=["experiment_file_name"], + required_if_not_set=True, + is_flag=True, + help="Use the baseline rag function to get benchmarking results.", +) +def main(data_version, experiment_file_name, benchmark): + get_experiment_results = GetExperimentResults() + get_experiment_results.set_data_version(data_version) + get_experiment_results.load_experiment_param_data(experiment_file_name=experiment_file_name, benchmark=benchmark) + get_experiment_results.load_chunks_from_jsonl_to_index() + get_experiment_results.loop_through_experiements() + get_experiment_results.create_visualisation_plus_grouped_results() + get_experiment_results.clear_index() + + +if __name__ == "__main__": + main() diff --git a/notebooks/evaluation/tests/test_rag_experiments.py b/notebooks/evaluation/tests/test_rag_experiments.py new file mode 100644 index 000000000..9a6034994 --- /dev/null +++ b/notebooks/evaluation/tests/test_rag_experiments.py @@ -0,0 +1,212 @@ +import json +import sys +from pathlib import Path +from unittest import mock + +import pytest + +# Hack - wasn't picking up imports properly +sys.path.append(str(Path(__file__).parents[2])) +import tempfile +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pandas as pd +from elasticsearch import Elasticsearch +from elasticsearch.helpers import scan +from evaluation.rag_experiments import GetExperimentResults + +from redbox.models import Settings + + +@pytest.fixture() +def env(): + return Settings() + + +@pytest.fixture() +def es_client(): + with ( + mock.patch("elasticsearch.Elasticsearch.search") as mocked_search, + mock.patch("elasticsearch.client.IndicesClient.create") as mocked_index_create, + mock.patch("elasticsearch.Elasticsearch.index") as mocked_index, + mock.patch("elasticsearch.helpers.scan") as mocked_scan, + mock.patch("elasticsearch.Elasticsearch.delete_by_query") as mocked_delete_by_query, + ): + mocked_search.return_value = {"hits": {"hits": [{"_id": "1", "_source": {"field": "value"}}]}} + mocked_index_create.return_value = {"acknowledged": True} + mocked_index.return_value = {"result": "created"} + mocked_delete_by_query.return_value = {"deleted": 1} + mocked_scan.return_value = iter( + [{"_id": "1", "_source": {"field": "value"}}, {"_id": "2", "_source": {"field": "value"}}] + ) + + client = Elasticsearch(hosts=["http://localhost:9200"]) + client.indices = mock.Mock() + client.indices.create = mocked_index_create + client.search = mocked_search + client.index = mocked_index + client.delete_by_query = mocked_delete_by_query + + yield client, mocked_search, mocked_index_create, mocked_index, mocked_scan, mocked_delete_by_query + + +@pytest.mark.usefixtures("es_client") +class TestGetExperimentResults(unittest.TestCase): + @pytest.fixture(autouse=True) + def inject_fixtures(self, es_client): + ( + self.es_client, + self.mocked_search, + self.mocked_index_create, + self.mocked_index, + self.mocked_scan, + self.mocked_delete_by_query, + ) = es_client + + return es_client + + @classmethod + def setUpClass(cls): + super().setUpClass() + + def setUp(self): + self.get_experiment_results = GetExperimentResults() + self.get_experiment_results.set_data_version("0.2.3") + + self.get_experiment_results.ENV = MagicMock() + + self.get_experiment_results.ES_CLIENT = self.es_client + + self.temp_dir = tempfile.TemporaryDirectory() + self.get_experiment_results.V_RESULTS = self.temp_dir.name + + self.mock_synthetic_path = "/mock/synthetic" + 
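Stepping back to the command-line entry point defined above: a small, hedged sketch of how the mutually exclusive options behave, using click's test runner so nothing needs a live Elasticsearch. "0.2.3" matches the data version used in these tests; "prompt_sweep" is the hypothetical parameters file from the earlier sketch, and a real run would additionally need the versioned data and services available.

```python
from click.testing import CliRunner

from evaluation.rag_experiments import main  # assumes the same sys.path setup as these tests

runner = CliRunner()

# Passing both --benchmark and --experiment_file_name should be rejected by the Mutex option.
both = runner.invoke(main, ["--data_version", "0.2.3", "--benchmark", "--experiment_file_name", "prompt_sweep"])
print(both.exit_code, both.output)  # expected: non-zero exit code with an "Illegal usage" message

# Passing neither should also be rejected, since one of the two must be provided.
neither = runner.invoke(main, ["--data_version", "0.2.3"])
print(neither.exit_code, neither.output)

# A real run would look like one of:
#   python notebooks/evaluation/rag_experiments.py --data_version 0.2.3 --benchmark
#   python notebooks/evaluation/rag_experiments.py --data_version 0.2.3 --experiment_file_name prompt_sweep
```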
self.mock_complete_ragas_file = "mock_complete_ragas_synthetic_data.csv" + self.get_experiment_results.synthetic_data_dir = self.mock_synthetic_path + + def tearDown(self): + self.temp_dir.cleanup() + + @patch("jsonlines.open") + def test_load_chunks_from_jsonl_to_index(self, mock_jsonlines_open): + mock_jsonlines_open.return_value.__enter__.return_value = iter( + [json.dumps({"uuid": "1234", "parent_file_uuid": "abcd", "data": "test data"})] + ) + + file_uuids = self.get_experiment_results.load_chunks_from_jsonl_to_index() + assert "abcd" in file_uuids + self.get_experiment_results.ES_CLIENT.index.assert_called_once() + + @patch("jsonlines.open") + def test_load_chunks_from_empty_jsonl(self, mock_jsonlines_open): + mock_jsonlines_open.return_value.__enter__.return_value = iter([]) + + result = self.get_experiment_results.load_chunks_from_jsonl_to_index() + assert result == set() + + def test_load_experiment_param_data(self): + with patch("pandas.read_csv") as mock_read_csv: + mock_read_csv.return_value = pd.DataFrame( + { + "experiment_name": ["test_experiment"], + "retrieval_system_prompt": ["test_prompt"], + "retrieval_question_prompt": ["test_question_prompt"], + } + ) + + self.get_experiment_results.load_experiment_param_data("test_file") + assert self.get_experiment_results.experiment_parameters["experiment_name"][0] == "test_experiment" + + @patch("evaluation.rag_experiments.GetExperimentResults.get_rag_results") + @patch("pandas.DataFrame.to_csv") + def test_write_rag_results(self, mock_to_csv, mock_get_rag_results): + mock_get_rag_results.return_value = { + "output_text": "test_output", + "source_documents": [{"page_content": "test_content"}], + } + + with patch("pandas.read_csv") as mock_read_csv: + mock_read_csv.return_value = pd.DataFrame({"input": ["test_input"]}) + self.get_experiment_results.write_rag_results() + mock_to_csv.assert_called_once() + + @patch("evaluation.rag_experiments.pd.read_csv") + @patch("evaluation.rag_experiments.evaluate") + def test_do_evaluation(self, mock_evaluate, mock_read_csv): + # Setting up the mock return value for read_csv + mock_data = pd.DataFrame({"column1": [1, 2, 3], "column2": ["a", "b", "c"]}) + mock_read_csv.return_value = mock_data + + mock_eval_result = pd.DataFrame( + { + "input": ["input1", "input2", "input3"], + "actual_output": ["output1", "output2", "output3"], + "expected_output": ["expected1", "expected2", "expected3"], + "context": [["context1"], ["context2"], ["context3"]], + "retrieval_context": [["retrieval1"], ["retrieval2"], ["retrieval3"]], + "additional_metadata": [None, None, None], + "comments": [None, None, None], + } + ) + mock_evaluate.return_value = mock_eval_result + + result = self.get_experiment_results.do_evaluation() + + assert result is not None + assert isinstance(result, pd.DataFrame) + assert result.shape == (3, 7) + + @patch("seaborn.barplot") + @patch("pandas.concat") + @patch("pandas.read_csv") + def test_create_visualisation_plus_grouped_results(self, mock_read_csv, mock_concat, mock_barplot): # noqa: ARG002 + mock_read_csv.return_value = pd.DataFrame( + {"experiment_name": ["test_experiment"], "score": [0.5], "metric_name": ["test_metric"]} + ) + mock_concat.return_value = mock_read_csv.return_value + + self.get_experiment_results.experiment_parameters = {"experiment_name": ["test_experiment"]} + self.get_experiment_results.create_visualisation_plus_grouped_results() + + @patch("pandas.read_csv") + def test_create_visualisation_empty_data(self, mock_read_csv): + mock_read_csv.return_value = 
pd.DataFrame() + self.get_experiment_results.experiment_parameters = {"experiment_name": []} + + with pytest.raises(ValueError): # noqa: PT011 + self.get_experiment_results.create_visualisation_plus_grouped_results() + + @patch("jsonlines.open") + def test_clear_index(self, mock_jsonlines_open): + with patch("pandas.read_csv") as mock_read_csv: + mock_read_csv.return_value = pd.DataFrame( + { + "experiment_name": ["test_experiment"], + "retrieval_system_prompt": ["test_prompt"], + "retrieval_question_prompt": ["test_question_prompt"], + } + ) + + self.get_experiment_results.load_experiment_param_data("test_file") + + mock_jsonlines_open.return_value.__enter__.return_value = iter( + [json.dumps({"uuid": "1234", "parent_file_uuid": "abcd", "data": "test data"})] + ) + + self.get_experiment_results.load_chunks_from_jsonl_to_index() + + self.get_experiment_results.ES_CLIENT.index.assert_called_once() + + self.get_experiment_results.clear_index() + documents_after_clear = list( + scan(self.get_experiment_results.ES_CLIENT, query={"query": {"match_all": {}}}) + ) + assert len(documents_after_clear) == 0 + + self.get_experiment_results.clear_index() + + +if __name__ == "__main__": + unittest.main() diff --git a/poetry.lock b/poetry.lock index d13ec4a80..3804dab8d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohttp" @@ -3921,9 +3921,13 @@ files = [ {file = "lxml-5.2.2-cp36-cp36m-win_amd64.whl", hash = "sha256:edcfa83e03370032a489430215c1e7783128808fd3e2e0a3225deee278585196"}, {file = "lxml-5.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:28bf95177400066596cdbcfc933312493799382879da504633d16cf60bba735b"}, {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3a745cc98d504d5bd2c19b10c79c61c7c3df9222629f1b6210c0368177589fb8"}, + {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b590b39ef90c6b22ec0be925b211298e810b4856909c8ca60d27ffbca6c12e6"}, {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b336b0416828022bfd5a2e3083e7f5ba54b96242159f83c7e3eebaec752f1716"}, + {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:c2faf60c583af0d135e853c86ac2735ce178f0e338a3c7f9ae8f622fd2eb788c"}, {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:4bc6cb140a7a0ad1f7bc37e018d0ed690b7b6520ade518285dc3171f7a117905"}, + {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7ff762670cada8e05b32bf1e4dc50b140790909caa8303cfddc4d702b71ea184"}, {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:57f0a0bbc9868e10ebe874e9f129d2917750adf008fe7b9c1598c0fbbfdde6a6"}, + {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:a6d2092797b388342c1bc932077ad232f914351932353e2e8706851c870bca1f"}, {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:60499fe961b21264e17a471ec296dcbf4365fbea611bf9e303ab69db7159ce61"}, {file = "lxml-5.2.2-cp37-cp37m-win32.whl", hash = "sha256:d9b342c76003c6b9336a80efcc766748a333573abf9350f4094ee46b006ec18f"}, {file = "lxml-5.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:b16db2770517b8799c79aa80f4053cd6f8b716f21f8aca962725a9565ce3ee40"}, @@ -4465,6 +4469,22 @@ files = [ griffe = ">=0.47" 
mkdocstrings = ">=0.25" +[[package]] +name = "mock" +version = "5.1.0" +description = "Rolling backport of unittest.mock for all Pythons" +optional = false +python-versions = ">=3.6" +files = [ + {file = "mock-5.1.0-py3-none-any.whl", hash = "sha256:18c694e5ae8a208cdb3d2c20a993ca1a7b0efa258c247a1e565150f477f83744"}, + {file = "mock-5.1.0.tar.gz", hash = "sha256:5e96aad5ccda4718e0a229ed94b2024df75cc2d55575ba5762d31f5767b8767d"}, +] + +[package.extras] +build = ["blurb", "twine", "wheel"] +docs = ["sphinx"] +test = ["pytest", "pytest-cov"] + [[package]] name = "moto" version = "5.0.10" @@ -5560,6 +5580,7 @@ optional = false python-versions = ">=3.9" files = [ {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"}, + {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"}, {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"}, {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"}, {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"}, @@ -5580,6 +5601,7 @@ files = [ {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"}, {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"}, {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"}, {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"}, {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"}, {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"}, @@ -7039,6 +7061,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ 
-7996,6 +8019,27 @@ dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodest doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.13.1)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0)", "sphinx-design (>=0.4.0)"] test = ["Cython", "array-api-strict", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] +[[package]] +name = "seaborn" +version = "0.13.2" +description = "Statistical data visualization" +optional = false +python-versions = ">=3.8" +files = [ + {file = "seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987"}, + {file = "seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7"}, +] + +[package.dependencies] +matplotlib = ">=3.4,<3.6.1 || >3.6.1" +numpy = ">=1.20,<1.24.0 || >1.24.0" +pandas = ">=1.2" + +[package.extras] +dev = ["flake8", "flit", "mypy", "pandas-stubs", "pre-commit", "pytest", "pytest-cov", "pytest-xdist"] +docs = ["ipykernel", "nbconvert", "numpydoc", "pydata_sphinx_theme (==0.10.0rc2)", "pyyaml", "sphinx (<6.0.0)", "sphinx-copybutton", "sphinx-design", "sphinx-issues"] +stats = ["scipy (>=1.7)", "statsmodels (>=0.12)"] + [[package]] name = "semantic-router" version = "0.0.48" @@ -10169,4 +10213,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.11,<3.12" -content-hash = "0f6bc0402cbde8567d6cdd705acee1d3af4bdb2e7e713362877266e89b6533ce" +content-hash = "0d54ca3b9faad13a7d7c77be4047b29aa8bf07d57acb3a6166e9cdab2cb60421" diff --git a/pyproject.toml b/pyproject.toml index c5cc872b1..f7afa4d37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,8 @@ faststream = {extras = ["redis"], version = "<0.5.0"} langchain-elasticsearch = "^0.2.0" +seaborn = "^0.13.2" +mock = "^5.1.0" [tool.poetry.group.api.dependencies] fastapi = "^0.111.0" uvicorn = "^0.30.1"
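seaborn and scipy do the statistical lifting in the evaluation script above. As a quick sanity check of the confidence-interval aggregation in `empirical_ci`, here is a self-contained worked example on invented scores; the experiment and metric names are placeholders.

```python
import pandas as pd
from scipy import stats

# Invented per-question scores for two experiments on one metric.
df = pd.DataFrame(
    {
        "experiment_name": ["benchmark"] * 4 + ["terse_prompts"] * 4,
        "metric_name": ["Faithfulness"] * 8,
        "score": [0.80, 0.70, 0.90, 0.85, 0.60, 0.65, 0.70, 0.75],
    }
)

# Same aggregation as empirical_ci(): group, take the mean and standard error,
# then build a 95% Student-t interval around the mean.
grouped = (
    df.groupby(["experiment_name", "metric_name"])["score"]
    .agg(["mean", "sem", "count"])
    .reset_index()
)
ci_low, ci_high = stats.t.interval(
    confidence=0.95, df=grouped["count"] - 1, loc=grouped["mean"], scale=grouped["sem"]
)
grouped["ci_low"], grouped["ci_high"] = ci_low, ci_high
print(grouped[["experiment_name", "mean", "ci_low", "ci_high"]])
```

Note that the saved barplot uses seaborn's bootstrap confidence intervals (errorbar=("ci", 95)), whereas this aggregated table uses the analytic t-interval, so the two can differ slightly.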