From 79b1c6088d01982b5691518171ba98f9ff9d2639 Mon Sep 17 00:00:00 2001
From: "Huanzhi (Hans) Mao" <huanzhimao@gmail.com>
Date: Sat, 4 Jan 2025 14:03:59 +0800
Subject: [PATCH] [BFCL Chore] Ensure Correct Input Format for Eval Checker
 (#860)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In some cases, the model handler’s decode_ast method returns
successfully but produces output in an unexpected format, causing issues
in downstream evaluations that do not perform argument format
validation. This problem is especially common when the model does not
output any function calls, resulting in a human-readable string instead
of the expected structure.

This PR refines the `is_function_calling_format_output` function so that,
before the checker function is called, the output is verified to be a
list of single-key dictionaries in the following format:
```
[
  {func1: {param1: val1, param2: val2, ...}},
  {func2: {param1: val1, param2: val2, ...}},
  ...
]
```

Note: This PR will not affect the leaderboard score.
---
 .../bfcl/utils.py                             | 26 ++++++++++++++-----
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/berkeley-function-call-leaderboard/bfcl/utils.py b/berkeley-function-call-leaderboard/bfcl/utils.py
index a1fb4ccb4..5ac2969f5 100644
--- a/berkeley-function-call-leaderboard/bfcl/utils.py
+++ b/berkeley-function-call-leaderboard/bfcl/utils.py
@@ -138,13 +138,25 @@ def sort_key(entry):
 
 
 def is_function_calling_format_output(decoded_output):
-    # Ensure the output is a list of dictionaries
-    if type(decoded_output) == list:
-        for item in decoded_output:
-            if type(item) != dict:
-                return False
-        return True
-    return False
+    """
+    Ensure the output is a list of dictionaries of the form:
+    `[{func1: {param1: val1, param2: val2, ...}}, {func2: {param1: val1, param2: val2, ...}}, ...]`
+    Sometimes the model handler's `decode_ast` method will return successfully, but the output is not in the correct format, which breaks the downstream evaluation that expects this format.
+    This is especially the case when the model doesn't predict any function calls, and the output is a human-readable string.
+    Note: Empty list `[]` is considered the correct format in this check.
+    """
+    if type(decoded_output) != list:
+        return False
+    for item in decoded_output:
+        if type(item) != dict:
+            return False
+        # Check for `{func1: {param1: val1, param2: val2, ...}}`, should only have one key-value pair
+        if len(item) != 1:
+            return False
+        # Check for `{param1: val1, param2: val2, ...}`; the parameter-value pairs should be a dictionary
+        if type(list(item.values())[0]) != dict:
+            return False
+    return True
 
 
 def is_executable_format_output(decoded_output):