Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancements #50

Merged
merged 8 commits into from
Aug 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
data


tmp
chroma
Expand Down
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,8 @@ test:
# Add a dev-group dependency named by the target suffix,
# e.g. `make add-dev_pytest` runs `poetry add --group dev pytest`.
add-dev_%:
	poetry add --group dev $*

# NOTE(review): this section is reconstructed from a diff view that showed
# both the old `run` target and the new `setup` target; likely only `setup`
# exists in the final file — confirm against the repository.
run:
	poetry run python -m main
# Run the project's setup entry point (python -m main).
setup:
	poetry run python -m main

# Start the FastAPI backend with auto-reload (development server).
run-backend:
	poetry run uvicorn backend.app.main:app --reload
76 changes: 76 additions & 0 deletions backend/app/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from typing import Optional

from fastapi import FastAPI, HTTPException, Depends
from pydantic import ValidationError
from pydantic import BaseModel
from config.utils import load_config
from src.rag_pipeline.rag_system import RAGSystem
from src.env_loader import load_api_keys

# Load API keys into the environment before any model clients are constructed.
load_api_keys()

# Application object served by uvicorn (see `make run-backend`).
app = FastAPI()


class InitializeRequest(BaseModel):
    """Request body for POST /initialize.

    strategy_name: configuration strategy name passed to ``load_config``.
    split_docs: optional value forwarded to ``RAGSystem.initialize`` —
        presumably a document-split limit; TODO confirm against RAGSystem.
    """

    strategy_name: str
    split_docs: Optional[int] = None


class QueryRequest(BaseModel):
    """Request body for POST /query: the question to answer."""

    question: str


# Module-level singleton holding the RAGSystem created by POST /initialize;
# stays None until that endpoint has completed successfully.
rag_system_instance: Optional[RAGSystem] = None


# Dependency to get the initialized RAGSystem
def get_rag_system():
    """FastAPI dependency yielding the initialized RAGSystem.

    Raises:
        HTTPException: 500 while no system has been set up via /initialize.
    """
    if rag_system_instance is not None:
        return rag_system_instance
    raise HTTPException(status_code=500, detail="RAG system is not initialized")


@app.get("/")
def read_root():
return {"message": "Hello from FastAPI backend!"}


@app.get("/health")
def health_check():
try:
rag_system = get_rag_system()
print(rag_system)
return {"status": "RAG system is initialized and running"}
except HTTPException as e:
return {"status": "RAG system is not initialized", "detail": str(e)}


@app.post("/initialize") # New endpoint for initialization
def initialize_rag_system(init_request: InitializeRequest):
global rag_system_instance
try:
config = load_config(init_request.strategy_name)
rag_system_instance = RAGSystem(config=config)
rag_system_instance.initialize(init_request.split_docs)
return {
"message": f"RAG system initialized with strategy '{init_request.strategy_name}' and split_docs={init_request.split_docs}"
}
except ValidationError as e:
raise HTTPException(status_code=400, detail=f"Configuration error: {e}") from e
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Initialization failed: {e}"
) from e


@app.post("/query")
def query_rag_system(
query_request: QueryRequest, rag_system: RAGSystem = Depends(get_rag_system)
):
try:
answer = rag_system.query(query_request.question)
return {"answer": answer}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Query failed: {e}") from e
2 changes: 1 addition & 1 deletion config/config_base.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
vectorstore:
clear_store: true
collection_name: "baseline_rag"
use_existing_vectorstore: true
use_existing_vectorstore: false
chunking:
chunk_type: recursive_character_splitter
chunk_size: 1000
Expand Down
18 changes: 18 additions & 0 deletions config/config_multiquery_500cs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Retrieval-strategy configuration (loaded via config.utils.load_config).
# NOTE(review): the filename says "multiquery_500cs", yet use_multiquery is
# false and chunk_size is 1000 — confirm this file's name matches its intent.
vectorstore:
  clear_store: false
  collection_name: 'chunk_1000_200_ada_002'
  use_existing_vectorstore: true  # reuse the persisted collection as-is
chunking:
  chunk_type: recursive_character_splitter
  chunk_size: 1000
  chunk_overlap: 200
retrieval:
  k_documents: 5
  use_ensemble: true
  use_multiquery: false
  use_reranker: false
  use_cohere_reranker: false
  top_n_ranked: 5
models:
  generator_model: 'gpt-4o'
  queries_generator_model: 'gpt-3.5-turbo'
2,588 changes: 2,588 additions & 0 deletions data/collection/cnn_dailymail_validation_subset.csv

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions data/evaluation_sets/evaluation_set_20d20.csv

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions data/ragas_results/bm_baseline_benchmark_results.csv

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions data/ragas_results/bm_chunk_size_500_overlap_100_results.csv

Large diffs are not rendered by default.

131 changes: 131 additions & 0 deletions data/ragas_results/bm_cs500_overlap100_multiquery_results.csv

Large diffs are not rendered by default.

127 changes: 127 additions & 0 deletions data/ragas_results/bm_embedding_model_3_large_openai_results.csv

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions data/ragas_results/bm_embedding_model_allmpnetv2_results.csv

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions data/ragas_results/bm_embedding_model_bge_large_results.csv

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions data/ragas_results/bm_embedding_model_bge_small_2_results.csv

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions data/ragas_results/bm_embedding_model_bge_small_results.csv

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions data/ragas_results/bm_ensemble_prompt_1_results.csv

Large diffs are not rendered by default.

159 changes: 159 additions & 0 deletions data/ragas_results/bm_ensemble_retriever_with_bm25_results.csv

Large diffs are not rendered by default.

122 changes: 122 additions & 0 deletions data/ragas_results/bm_multiquery_prompt_1_results.csv

Large diffs are not rendered by default.

125 changes: 125 additions & 0 deletions data/ragas_results/bm_multiquery_retriever_results.csv

Large diffs are not rendered by default.

Large diffs are not rendered by default.

63 changes: 32 additions & 31 deletions misc/settings.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,49 @@
import os
from typing import Final


class Settings:
    """Project-wide configuration constants for the RAG benchmark pipeline.

    Reconstructed from a diff view that intertwined old and new lines:
    duplicate ``SOURCE_FILE_PATH`` / ``GENERATOR_TEMPLATE`` assignments and a
    large commented-out dead template are removed, keeping the newer values.
    """

    # Postgres DSN; None when the environment variable is unset.
    PG_CONNECTION_STRING: Final = os.getenv("PG_CONNECTION_STRING")
    COLLECTION_NAME: Final = "cnn_dailymail_validation_subset"
    SOURCE_FILE_PATH: Final = "data/collection/cnn_dailymail_validation_subset.csv"
    CHUNK_SIZE: Final = 1000
    CHUNK_OVERLAP: Final = 200
    # CSV column whose text becomes the document page content.
    PAGE_CONTENT_COLUMN: Final = "article"

    # Prompt template for the answer generator; must keep the {context} and
    # {question} placeholders expected by the RAG chain.
    GENERATOR_TEMPLATE: Final = """
    Use the following pieces of context to answer the question at the end.
    These are the instruction to consider:
    - Prioritize accuracy and conciseness in your response.
    - Answer directly and avoid repeating information from the question.
    - If the context doesn't contain the answer, just say that "I don't know".
    - Don't try to make up an answer.
    - Limit your answer to three sentences maximum, but aim for two if possible.

    Example:
    Context: The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. It is named after the engineer Gustave Eiffel, whose company designed and built the tower.

    Question: Where is the Eiffel Tower located?
    Answer: Paris, France

    REMEMBER TO FOLLOW THE INSTRUCTIONS ABOVE.

    Context: {context}
    Question: {question}
    Answer:
    """

    RESULTS_DIR: Final = "data/ragas_results"
    EVALUATION_FILE_PATH: Final = "data/evaluation_sets/evaluation_set_20d20.csv"
    # NOTE(review): name carries a historical typo ("EVALUAION"); kept
    # unchanged for backward compatibility with existing callers.
    EVALUAION_DATASET_NAME: Final = "CNN DailyMail Evaluation Dataset"
    EVALUATION_DATASET_DESCRIPTION: Final = """
    Evaluation dataset questions and ground truth answers for RAGAS pipeline on cnn_dailymail dataset.
    """

151 changes: 151 additions & 0 deletions notebooks/benchmark analysis/compare_benchmarks.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,157 @@
"\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'answer_correctness': {'average': 0.6890095326155914, 'standard_deviation': 0.1892823422612965}, 'faithfulness': {'average': 0.8633333333333333, 'standard_deviation': 0.21599761748740176}, 'answer_relevancy': {'average': 0.8468700910459616, 'standard_deviation': 0.2921967841570098}, 'context_precision': {'average': 0.9799999999751596, 'standard_deviation': 0.042440188394626564}}\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"def calculate_metrics_statistics(df, numeric_columns):\n",
" \"\"\"\n",
" Calculate the averages and standard deviations of specified numeric columns in a DataFrame.\n",
"\n",
" Parameters:\n",
" df (pd.DataFrame): The DataFrame containing the data.\n",
" numeric_columns (list): A list of column names for which to calculate statistics.\n",
"\n",
" Returns:\n",
" dict: A dictionary containing the averages and standard deviations for each column.\n",
" \"\"\"\n",
" statistics = {}\n",
" \n",
" for column in numeric_columns:\n",
" if column in df.columns:\n",
" avg = df[column].mean()\n",
" std_dev = df[column].std()\n",
" statistics[column] = {\n",
" 'average': avg,\n",
" 'standard_deviation': std_dev\n",
" }\n",
" else:\n",
" statistics[column] = {\n",
" 'average': None,\n",
" 'std_dev': None,\n",
" 'error': f\"Column '{column}' not found in DataFrame.\"\n",
" }\n",
" \n",
" return statistics\n",
"\n",
"# Example usage\n",
"# Assuming you have a DataFrame named 'results_df' with the relevant columns\n",
"numeric_columns = ['answer_correctness', 'faithfulness', 'answer_relevancy', 'context_precision']\n",
"# results_df = pd.DataFrame({\n",
"# 'answer_correctness': [0.712666, 0.998523, 0.618642, 0.785931, 0.844],\n",
"# 'faithfulness': [0.83333, 1, 0.2, 1, 1],\n",
"# 'answer_relevancy': [0.983714, 0.944596, 0.938077, 0.973125, 0.921761],\n",
"# 'context_precision': [1, 0.95, 1, 1, 1]\n",
"# })\n",
"results_df = pd.read_csv(baseline_filepath)\n",
"\n",
"statistics = calculate_metrics_statistics(results_df, numeric_columns)\n",
"print(statistics)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import json\n",
"import os\n",
"\n",
"NUMERICAL_COLUMNS = ['answer_correctness', 'faithfulness', 'answer_relevancy', 'context_precision']\n",
"\n",
"def update_json_with_metrics(csv_filename, json_filename, numeric_columns=NUMERICAL_COLUMNS):\n",
" \"\"\"\n",
" Read a CSV file, calculate averages and standard deviations for specified columns,\n",
" and update or create a JSON file with the results.\n",
"\n",
" Parameters:\n",
" csv_filename (str): The name of the CSV file to read.\n",
" numeric_columns (list): A list of column names for which to calculate statistics.\n",
" json_filename (str): The name of the JSON file to update or create.\n",
" \"\"\"\n",
" # Load the CSV file\n",
" df = pd.read_csv(csv_filename)\n",
"\n",
" # Calculate averages and standard deviations\n",
" statistics = {}\n",
" for column in numeric_columns:\n",
" if column in df.columns:\n",
" avg = df[column].mean()\n",
" std_dev = df[column].std()\n",
" statistics[column] = {\n",
" 'average': avg,\n",
" 'std_dev': std_dev\n",
" }\n",
" else:\n",
" statistics[column] = {\n",
" 'average': None,\n",
" 'std_dev': None,\n",
" 'error': f\"Column '{column}' not found in DataFrame.\"\n",
" }\n",
"\n",
" # Get the key from the filename (remove .csv)\n",
" key_name = os.path.splitext(os.path.basename(csv_filename))[0]\n",
"\n",
" # Load existing JSON data or create a new dictionary\n",
" if os.path.exists(json_filename):\n",
" with open(json_filename, 'r') as json_file:\n",
" json_data = json.load(json_file)\n",
" else:\n",
" json_data = {}\n",
"\n",
" # Update the JSON data with the new statistics\n",
" json_data[key_name] = statistics\n",
"\n",
" # Write the updated data back to the JSON file\n",
" with open(json_filename, 'w') as json_file:\n",
" json.dump(json_data, json_file, indent=4)\n",
"\n",
"# Example usage\n",
"# csv_filename = 'baseline_results.csv' # Replace with your actual CSV file name\n",
"json_filename = 'benchmark_results.json' # The JSON file to update/create\n",
"\n",
"update_json_with_metrics(bge_large, numeric_columns, json_filename)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def save_dict_to_json(dictionary, filename):\n",
" \"\"\"\n",
" Save a dictionary to a JSON file.\n",
"\n",
" Parameters:\n",
" dictionary (dict): The dictionary to save.\n",
" filename (str): The name of the file to save the dictionary to.\n",
" \"\"\"\n",
" import json\n",
"\n",
" with open(filename, 'w') as file:\n",
" json.dump(dictionary, file)\n",
" \n",
"# Example usage\n",
"# Assuming you have a dictionary named 'statistics' and a filename 'statistics.json'\n",
"save_dict_to_json(statistics, 'data/benchmark_results/statistics.json')"
]
},
{
"cell_type": "code",
"execution_count": 4,
Expand Down
Loading
Loading