diff --git a/README.md b/README.md
index e271b56c..7d9eb63a 100644
--- a/README.md
+++ b/README.md
@@ -37,20 +37,26 @@
         PyPI Version
-
-        GitHub stars
+
+        GitHub stars
-
-        Open Issues
+
+        Open Issues
-        License
+        License
         discord-invite
+
+
+For AI researchers, product teams, and software engineers who want to learn the AI way.
+
+
-      3 avg_recall, recall_list = retriever_recall.compute(retrieved_contexts, gt_contexts)\n      4 print(f\"average recall: {avg_recall}\")\n      5 print(f\"recall list: {recall_list}\")\n",
-     "NameError: name 'retrieved_contexts' is not defined"
-    ]
-   }
-  ],
-  "source": [
-   "# Compute the recall.\n",
-   "retriever_recall = RetrieverRecall()\n",
-   "avg_recall, recall_list = retriever_recall.compute(retrieved_contexts, gt_contexts)\n",
-   "print(f\"average recall: {avg_recall}\")\n",
-   "print(f\"recall list: {recall_list}\")"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "# Compute the relevance.\n",
-   "retriever_relevance = RetrieverRelevance()\n",
-   "avg_relevance, relevance_list = retriever_relevance.compute(\n",
-   "    retrieved_contexts, gt_contexts\n",
-   ")\n",
-   "print(f\"average relevance: {avg_relevance}\")\n",
-   "print(f\"relevance list: {relevance_list}\")"
-  ]
- },
- {
-  "cell_type": "markdown",
-  "metadata": {},
-  "source": [
-   "Next, we evaluate the generated answers using the AnswerMatchAcc metric, which compares the predicted answer with the ground truth answer."
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "# Compute the answer match accuracy.\n",
-   "answer_match_acc = AnswerMatchAcc(type=\"exact_match\")\n",
-   "avg_acc, acc_list = answer_match_acc.compute(pred_answers, gt_answers)\n",
-   "print(f\"average accuracy: {avg_acc}\")\n",
-   "print(f\"accuracy list: {acc_list}\")\n",
-   "answer_match_acc = AnswerMatchAcc(type=\"fuzzy_match\")\n",
-   "avg_acc, acc_list = answer_match_acc.compute(pred_answers, gt_answers)\n",
-   "print(f\"average accuracy: {avg_acc}\")\n",
-   "print(f\"accuracy list: {acc_list}\")"
-  ]
- },
- {
-  "cell_type": "markdown",
-  "metadata": {},
-  "source": [
-   "We finally use an LLM as the judge for evaluating the performance. The task description in the `DEFAULT_LLM_EVALUATOR_PROMPT` is \"You are a helpful assistant. Given the question, ground truth answer, and predicted answer, you need to answer the judgement query. Output True or False according to the judgement query.\" You can customize the task description as needed. See the `lightrag.eval.LLMasJudge` class for more details."
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "llm_judge = LLMasJudge()\n",
-   "judgement_query = (\n",
-   "    \"For the question, does the predicted answer contain the ground truth answer?\"\n",
-   ")\n",
-   "avg_judgement, judgement_list = llm_judge.compute(\n",
-   "    questions, gt_answers, pred_answers, judgement_query\n",
-   ")\n",
-   "print(f\"average judgement: {avg_judgement}\")\n",
-   "print(f\"judgement list: {judgement_list}\")"
-  ]
- }
-],
-"metadata": {
- "kernelspec": {
-  "display_name": "lightrag-project",
-  "language": "python",
-  "name": "light-rag-project"
- },
- "language_info": {
-  "codemirror_mode": {
-   "name": "ipython",
-   "version": 3
-  },
-  "file_extension": ".py",
-  "mimetype": "text/x-python",
-  "name": "python",
-  "nbconvert_exporter": "python",
-  "pygments_lexer": "ipython3",
-  "version": "3.12.3"
- }
-},
-"nbformat": 4,
-"nbformat_minor": 2
-}
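For reference, the deleted notebook exercised four evaluators: RetrieverRecall, RetrieverRelevance, AnswerMatchAcc, and LLMasJudge. Below is a minimal standalone sketch of the same flow; it assumes all four classes import from `lightrag.eval` (the notebook confirms that path only for LLMasJudge), the toy data is hypothetical, and LLMasJudge additionally needs a configured LLM backend to run.

# Hedged sketch of the notebook's evaluation flow; the data below is made up,
# and the import path is assumed from the notebook's reference to
# lightrag.eval.LLMasJudge.
from lightrag.eval import AnswerMatchAcc, LLMasJudge, RetrieverRecall, RetrieverRelevance

questions = ["where is brian?"]
gt_contexts = [["Brian is in the kitchen."]]       # ground-truth passages per query
retrieved_contexts = ["Brian is in the kitchen."]  # retrieved text per query (shape assumed)
gt_answers = ["in the kitchen"]
pred_answers = ["Brian is in the kitchen."]

# Each compute() call returns (average_score, per_query_scores), as in the notebook.
avg_recall, recall_list = RetrieverRecall().compute(retrieved_contexts, gt_contexts)
avg_rel, rel_list = RetrieverRelevance().compute(retrieved_contexts, gt_contexts)
avg_exact, _ = AnswerMatchAcc(type="exact_match").compute(pred_answers, gt_answers)
avg_fuzzy, _ = AnswerMatchAcc(type="fuzzy_match").compute(pred_answers, gt_answers)
avg_judge, judge_list = LLMasJudge().compute(
    questions,
    gt_answers,
    pred_answers,
    "For the question, does the predicted answer contain the ground truth answer?",
)
print(avg_recall, avg_rel, avg_exact, avg_fuzzy, avg_judge)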
diff --git a/use_cases/retrieval_augmented_generation/user_code.py b/use_cases/retrieval_augmented_generation/user_code.py
deleted file mode 100644
index bb2c0ca5..00000000
--- a/use_cases/retrieval_augmented_generation/user_code.py
+++ /dev/null
@@ -1,290 +0,0 @@
-from typing import Any, Dict, Optional
-import logging
-
-import torch
-
-from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
-
-from adalflow.core.model_client import ModelClient
-
-log = logging.getLogger(__name__)
-
-
-class TransformerLLM:
-    models: Dict[str, type] = {}
-
-    def __init__(self, model_name: Optional[str] = None):
-        super().__init__()
-        if model_name is not None:
-            self.model_name = model_name
-        """Lazy initialisation of the model in TransformerClient.init_sync_client()"""
-
-    def init_model(
-        self,
-        model_name: str,
-        auto_model: Optional[type] = AutoModel,
-        auto_tokenizer: Optional[type] = AutoTokenizer,
-    ):
-        try:
-            self.tokenizer = auto_tokenizer.from_pretrained(model_name)
-            self.model = auto_model.from_pretrained(model_name, is_decoder=True)
-            # register the model
-            self.models[model_name] = self.model
-            self.device = "cuda" if torch.cuda.is_available() else "cpu"
-            log.info(f"Done loading model {model_name}")
-            # Set pad token if it's not already set
-            if self.tokenizer.pad_token is None:
-                self.tokenizer.pad_token = self.tokenizer.eos_token  # common fallback
-                self.model.config.pad_token_id = (
-                    self.tokenizer.eos_token_id
-                )  # ensure consistency in the model config
-        except Exception as e:
-            log.error(f"Error loading model {model_name}: {e}")
-            raise e
-
-    def parse_chat_completion(self, input_text: str, response: str):
-        print("|" * 24)
-        print("input lowered")
-        print("|" * 24)
-        print(input_text.lower().replace("\n", ""))
-        print("|" * 24)
-        print("|" * 24)
-        print("response lowered")
-        print("|" * 24)
-        print(response.lower().replace("\n", ""))
-        print("|" * 24)
-        if input_text.lower() in response.lower():
-            parsed_response = response.replace(
-                input_text, ""
-            ).strip()  # Safely handle cases where input_text might not be in response
-        else:
-            parsed_response = response
-        if "xxxxx" in parsed_response:
-            cut_idx = parsed_response.find("xxxxx")
-            parsed_response = parsed_response[cut_idx + 5 :]
-        print(parsed_response)
-        return parsed_response
-
-    def call(
-        self,
-        input_text: str,
-        skip_special_tokens: bool = True,
-        clean_up_tokenization_spaces: bool = False,
-        max_length: int = 150,
-    ):
-        if not self.model:
-            log.error("Model is not initialized.")
-            raise ValueError("Model is not initialized.")
-
-        # Ensure tokenizer has pad token; set it if not
-        if self.tokenizer.pad_token is None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token
-            self.model.config.pad_token_id = (
-                self.tokenizer.eos_token_id
-            )  # Sync model config pad token id
-
-        # Process inputs with attention mask and padding
-        inputs = self.tokenizer(input_text, return_tensors="pt", padding=True).to(
-            self.device
-        )
-
-        with torch.no_grad():  # Ensures no gradients are calculated to save memory and computations
-            generate_ids = self.model.generate(
-                inputs["input_ids"],
-                attention_mask=inputs["attention_mask"],
-                max_length=max_length,  # Control the output length more precisely
-                repetition_penalty=5.0,
-            )
-        response = self.tokenizer.decode(
-            generate_ids[0],
-            skip_special_tokens=skip_special_tokens,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-        )
-        print("o" * 24)
-        print("raw LLM output")
-        print("o" * 24)
-        print(response)
-        print("o" * 24)
-
-        print("o" * 24)
-        print("input text")
-        print("o" * 24)
-        print(input_text)
-        print("o" * 24)
-        parsed_response = self.parse_chat_completion(
-            input_text=input_text, response=response
-        )
-        return parsed_response
-
-    def __call__(
-        self,
-        input_text: str,
-        skip_special_tokens: bool = True,
-        clean_up_tokenization_spaces: bool = False,
-        max_length: int = 150,
-        model=None,  # For compatibility with Generator ||||| might be something to fix in Generator source code -> api_kwargs always contains a 'model' argument. 'model' is parsed either in call() or in __init__().
-        **kwargs,
-    ):
-        return self.call(
-            input_text=input_text,
-            skip_special_tokens=skip_special_tokens,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            max_length=max_length,
-        )
-
-
-class CustomLlmModelClient(ModelClient):
-
-    def __init__(
-        self,
-        llm_model: TransformerLLM,
-        auto_model: Optional[type] = AutoModel,
-        auto_tokenizer: Optional[type] = AutoTokenizer,
-    ) -> None:
-        super().__init__()
-        self.transformer_llm = llm_model
-        self.llm_client = self.init_llm_client(
-            auto_model=auto_model, auto_tokenizer=auto_tokenizer
-        )
-
-    def init_llm_client(
-        self,
-        auto_model: Optional[type] = AutoModel,
-        auto_tokenizer: Optional[type] = AutoTokenizer,
-    ):
-        model_name = self.transformer_llm.model_name
-        self.transformer_llm.init_model(
-            model_name, auto_model=auto_model, auto_tokenizer=auto_tokenizer
-        )
-        """The TransformerLLM is initialised by the user, so the parentheses were removed from the return statement to avoid executing self.transformer_llm.call()"""
-        return self.transformer_llm
-
-    def call(
-        self, api_kwargs: Dict = {}, model_type=None  # For compatibility with Generator
-    ):
-        if "model" not in api_kwargs:
-            raise ValueError("model must be specified in api_kwargs")
-        if not hasattr(self, "llm_client") or self.llm_client is None:
-            self.llm_client = self.init_llm_client()
-        response = self.llm_client(**api_kwargs)
-        return response
-
-    def convert_inputs_to_api_kwargs(
-        self,
-        input: Any,  # for retriever, it is a single query
-        model_kwargs: dict = {},
-        model_type=None,  # For compatibility with Generator
-    ) -> dict:
-        final_model_kwargs = model_kwargs.copy()
-        assert "model" in final_model_kwargs, "model must be specified"
-        final_model_kwargs["input_text"] = input
-        return final_model_kwargs
-
-    def parse_chat_completion(self, completion: str):
-        """Method implemented for compatibility with Generator.
-        Return the input of the function without changing it.
-        """
-        return completion
-
-
-if __name__ == "__main__":
-    from adalflow.core import Generator
-
-    MODEL = "BAAI/bge-small-en-v1.5"
-    context = "Brian is in the kitchen."
-    query = "where is brian?"
-
-    rag_prompt_task_desc = {
-        "task_desc_str": r"""
-You are a helpful assistant.
-
-Your task is to answer the query that may or may not come with context information.
-When context is provided, you should stick to the context and less on your prior knowledge to answer the query.
-
-Insert your answer to the query.
-
-xxxxx
-"""
-    }
-    template = """
-    <SYS>
-    {# task desc #}
-    {% if task_desc_str %}
-    {{task_desc_str}}
-    {% else %}
-    You are a helpful assistant.
-    {% endif %}
-    {# output format #}
-    {% if output_format_str %}
-    <OUTPUT_FORMAT>
-    {{output_format_str}}
-    </OUTPUT_FORMAT>
-    {% endif %}
-    {# tools #}
-    {% if tools_str %}
-    <TOOLS>
-    {{tools_str}}
-    </TOOLS>
-    {% endif %}
-    {# example #}
-    {% if examples_str %}
-    <EXAMPLES>
-    {{examples_str}}
-    </EXAMPLES>
-    {% endif %}
-    {# chat history #}
-    {% if chat_history_str %}
-    <CHAT_HISTORY>
-    {{chat_history_str}}
-    </CHAT_HISTORY>
-    {% endif %}
-    {# context #}
-    {% if context_str %}
-    <CONTEXT>
-    {{context_str}}
-    </CONTEXT>
-    {% endif %}
-    {# steps #}
-    {% if steps_str %}
-    <STEPS>
-    {{steps_str}}
-    </STEPS>
-    {% endif %}
-    </SYS>
-
-    {% if input_str %}
-    <User>
-    {{input_str}}
-    </User>
-    {% endif %}
-    You:
-
-    """
-
-    prompt_kwargs = {
-        "input_str": query,
-        "context_str": context,
-    }
-    model_kwargs = {"model": MODEL, "temperature": 1, "stream": False}
-    transformer_llm = TransformerLLM(MODEL)
-    llm_client = CustomLlmModelClient(transformer_llm, auto_model=AutoModelForCausalLM)
-    generator = Generator(
-        template=template,
-        model_client=llm_client,
-        model_kwargs=model_kwargs,
-        prompt_kwargs=rag_prompt_task_desc,
-        # output_processors=JsonParser()
-    )
-    print("-" * 24)
-    response = generator(prompt_kwargs)
-    print(response)
-    print("-" * 24)
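The deleted `__main__` block drives the client through adalflow's Generator; the client can also be exercised directly, which makes the ModelClient contract easier to see. A minimal sketch, assuming the two classes from the deleted file are still importable, with "gpt2" used purely as a placeholder causal LM (the original script loaded BAAI/bge-small-en-v1.5):

# Hedged sketch: drive the TransformerLLM/CustomLlmModelClient pair from the
# deleted user_code.py without a Generator. "gpt2" is a placeholder name only.
from transformers import AutoModelForCausalLM

llm = TransformerLLM("gpt2")
client = CustomLlmModelClient(llm, auto_model=AutoModelForCausalLM)

# Generator normally builds api_kwargs; convert_inputs_to_api_kwargs mimics that
# contract by hand: it requires "model" in model_kwargs and injects the prompt
# as input_text.
api_kwargs = client.convert_inputs_to_api_kwargs(
    input="where is brian?", model_kwargs={"model": "gpt2"}
)
print(client.call(api_kwargs=api_kwargs))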