feat(autofix): Split root cause context extraction and formatting (#898)

Separates the root cause analysis step into two stages: an agent that gathers context and derives the root cause, followed by a separate formatting LLM call. This results in a much more reliable root cause analysis: errors in our eval set of `n=64` dropped from 10 to 0. Quality also improved on the other scores: _(the original run did not have a weighted error score at the time)_ <img width="1439" alt="Screenshot 2024-07-13 at 1 14 24 AM" src="https://github.com/user-attachments/assets/595ac5c5-cf2b-40c2-affd-157fd3fa5adb"> <img width="1663" alt="Screenshot 2024-07-13 at 1 14 28 AM" src="https://github.com/user-attachments/assets/9a91a22b-90cc-4d98-805b-0f79fa3d2234"> ## Future Improvements The formatting LLM does not need to be the expensive `gpt4o`; we can switch to a cheaper model such as Claude 3 Haiku in the near future.
getsentry · Jul 15, 2024 · 6298ba1 · 6298ba1
1 parent 0f71e94
commit 6298ba1
Show file tree

Hide file tree

Showing 4 changed files with 246 additions and 155 deletions.
diff --git a/src/seer/automation/autofix/components/root_cause/component.py b/src/seer/automation/autofix/components/root_cause/component.py
@@ -2,6 +2,7 @@
 from sentry_sdk.ai.monitoring import ai_track
 
 from seer.automation.agent.agent import GptAgent
+from seer.automation.agent.client import GptClient
 from seer.automation.agent.models import Message
 from seer.automation.autofix.autofix_context import AutofixContext
 from seer.automation.autofix.components.root_cause.models import (
@@ -13,7 +14,7 @@
 from seer.automation.autofix.tools import BaseTools
 from seer.automation.autofix.utils import autofix_logger
 from seer.automation.component import BaseComponent
-from seer.automation.utils import escape_multi_xml
+from seer.automation.utils import escape_multi_xml, extract_text_inside_tags
 
 
 class RootCauseAnalysisComponent(BaseComponent[RootCauseAnalysisRequest, RootCauseAnalysisOutput]):
@@ -27,6 +28,7 @@ def invoke(self, request: RootCauseAnalysisRequest) -> RootCauseAnalysisOutput |
         agent = GptAgent(
             tools=tools.get_tools(),
             memory=[Message(role="system", content=RootCauseAnalysisPrompts.format_system_msg())],
+            stop_message="<NO_ROOT_CAUSES>",
         )
 
         response = agent.run(
@@ -43,8 +45,29 @@ def invoke(self, request: RootCauseAnalysisRequest) -> RootCauseAnalysisOutput |
             autofix_logger.warning("Root Cause Analysis agent did not return a valid response")
             return None
 
+        if "<NO_ROOT_CAUSES>" in response:
+            return None
+
+        extracted_response = extract_text_inside_tags(response, "potential_root_causes")
+
+        formatter_response, formatter_usage = GptClient().completion(
+            [
+                Message(
+                    role="user",
+                    content=RootCauseAnalysisPrompts.root_cause_formatter_msg(extracted_response),
+                )
+            ]
+        )
+
+        with self.context.state.update() as cur:
+            cur.usage += formatter_usage
+
+        if not formatter_response.content:
+            autofix_logger.warning("Root Cause Analysis formatter did not return a valid response")
+            return None
+
         xml_response = RootCauseAnalysisOutputPromptXml.from_xml(
-            f"<root>{escape_multi_xml(response, ['thoughts', 'snippet', 'title', 'description'])}</root>"
+            f"<root>{escape_multi_xml(formatter_response.content, ['thoughts', 'snippet', 'title', 'description'])}</root>"
         )
 
         # Assign the ids to be the numerical indices of the causes and suggested fixes

diff --git a/src/seer/automation/autofix/components/root_cause/models.py b/src/seer/automation/autofix/components/root_cause/models.py
@@ -1,6 +1,8 @@
 from typing import Annotated, Optional
 
-from pydantic import BaseModel, StringConstraints
+from johen import gen
+from johen.examples import Examples
+from pydantic import BaseModel, Field, StringConstraints
 from pydantic_xml import attr, element
 
 from seer.automation.component import BaseComponentOutput, BaseComponentRequest
@@ -52,8 +54,8 @@ class RootCauseAnalysisItem(BaseModel):
     id: int = -1
     title: str
     description: str
-    likelihood: float
-    actionability: float
+    likelihood: Annotated[float, Examples(r.uniform(0, 1) for r in gen)] = Field(..., ge=0, le=1)
+    actionability: Annotated[float, Examples(r.uniform(0, 1) for r in gen)] = Field(..., ge=0, le=1)
     suggested_fixes: Optional[list[RootCauseSuggestedFix]] = None
 
 

diff --git a/src/seer/automation/autofix/components/root_cause/prompts.py b/src/seer/automation/autofix/components/root_cause/prompts.py
@@ -17,26 +17,11 @@ def format_system_msg():
 
             You have tools to search a codebase to find the root cause of an issue. Please use the tools as many times as you want to find the root cause of the issue. It is very important to be very detailed and clear in your output.
 
-            You must follow the below XML format in your final output:
-            {root_cause_output_example_str}
-
             Guidelines:
-            - The likelihood must be a float between 0 and 1. It represents the likelihood that the root cause is correct.
-            - The actionability must be a float between 0 and 1. It represents if a fix to this cause is actionable within this codebase.
-                - For example, if it's caused by a malformed API request, then it's not actionable in the codebase.
-                - If there is a clear code change that can be made to fix the issue, then it is actionable.
-                - If you do not have a clear code change that can be made to fix the issue, then it should be scored low.
-            - You can include suggested fixes if you have ones that are clear and actionable within the immediate codebase.
-                - In a suggested fix, suggest a fix with expert judgement, consider the implications of the fix, and the potential side effects.
-                - For example, simply raising an exception or assigning a default value may not be the best fix.
-                - The elegance of a fix is a float between 0 and 1. The higher the score the better the fix.
-                    - A fix by a staff engineer will have a high elegance score.
-                    - A fix by a junior engineer or intern will have a low elegance score.
-                - When you provide code snippets for a suggested fix, you must provide explicit code changes and file paths.
-            - Don't always assume the data being passed is correct, it could be incorrect! Sometimes the API request is malformed, or there is a bug on the client/server side that is causing the issue.
+            - Don't always assume data being passed is correct, it could be incorrect! Sometimes the API request is malformed, or there is a bug on the client/server side that is causing the issue.
             - You are not able to search in or make changes to external libraries. If the error is caused by an external library or the stacktrace only contains frames from external libraries, do not attempt to search or suggest changes in external libraries.
             - You must only suggest actionable fixes that can be made in the immediate workable codebase. Do not suggest fixes with code snippets in external libraries.
-            - If you are not able to find any potential root causes, you can return an empty <potential_root_causes></potential_root_causes> tag.
+            - If you are not able to find any potential root causes, return only <NO_ROOT_CAUSES>.
             - If multiple searches turn up no viable results, you should conclude the session.
 
             It is important that we find all the potential root causes of the issue, so provide as many possibilities as you can for the root cause, ordered from most likely to least likely."""
@@ -56,8 +41,35 @@ def format_default_msg(
 
             {instruction_str}
 
-            Think step-by-step in a <thoughts> block before returning the potential root causes of the issue inside a <potential_root_causes> block."""
+            Think step-by-step each time before using the tools provided to you.
+            Also think step-by-step before giving the final answer.
+
+            When ready with your final answer, detail all the potential root causes of the issue inside wrapped with a <potential_root_causes></potential_root_causes> block.
+            - Each root cause should be inside its own <root_cause> block with its suggested fix in there too.
+            - Include float values from 0.0-1.0 of the likelihood and actionability of each root cause.
+            - If there is a clear and obvious fix to a given root cause, suggest a fix and provide a code snippet if possible. Suggest as many fixes as you can.
+            - You MUST include the EXACT file name in the code snippets you provide. If you cannot, do not provide a code snippet."""
         ).format(
             error_str=event.format_event(),
             instruction_str=format_instruction(instruction),
         )
+
+    @staticmethod
+    def root_cause_formatter_msg(raw_root_cause_output: str):
+        return textwrap.dedent(
+            """\
+            Given the following root cause analysis:
+
+            {raw_root_cause_output}
+
+            Please format properly according to the following guidelines:
+
+            {root_cause_output_example_str}
+
+            Note: If the provided root cause analysis is not formatted properly, such as suggested fixes missing descriptions, you can derive them from the provided root cause analysis.
+
+            Return only the formatted root cause analysis:"""
+        ).format(
+            raw_root_cause_output=raw_root_cause_output,
+            root_cause_output_example_str=MultipleRootCauseAnalysisOutputPromptXml.get_example().to_prompt_str(),
+        )