From 09827b997dfff3843a0596769943fa1bbc94303f Mon Sep 17 00:00:00 2001 From: imda-amdlahir Date: Tue, 14 Jan 2025 12:27:27 +0800 Subject: [PATCH] fix report raw metrics rendering --- .../report/components/rawScoresTable.tsx | 67 +++++++++++++++---- .../report/types/benchmarkReportTypes.ts | 33 ++++++--- 2 files changed, 77 insertions(+), 23 deletions(-) diff --git a/app/benchmarking/report/components/rawScoresTable.tsx b/app/benchmarking/report/components/rawScoresTable.tsx index e54dd252..57f2ed7b 100644 --- a/app/benchmarking/report/components/rawScoresTable.tsx +++ b/app/benchmarking/report/components/rawScoresTable.tsx @@ -52,15 +52,46 @@ export function RawRecipeMetricsScoresTable({ {resultPromptData.map((promptData, idx) => { - let stringifiedMetrics = ''; - try { - stringifiedMetrics = JSON.stringify( - promptData.metrics, - null, - 2 + let stringifiedMetrics: string | React.ReactNode = ''; + let tooLong = false; + for (const metric of promptData.metrics) { + if ('grading_criteria' in metric) { + for (const [_, value] of Object.entries(metric)) { + if ( + typeof value === 'object' && + 'individual_scores' in value + ) { + if ( + (value.individual_scores.successful && + value.individual_scores.successful.length > 20) || + (value.individual_scores.unsuccessful && + value.individual_scores.unsuccessful.length > 20) + ) { + tooLong = true; + break; + } + } + } + } + } + if (tooLong) { + stringifiedMetrics = ( + <> + Metric result is too long to display.
+ Click on "Download Details Scoring JSON"
+ below to view raw scores. + ); - } catch (error) { - console.log(error); + } else { + try { + stringifiedMetrics = JSON.stringify( + promptData.metrics, + null, + 2 + ); + } catch (error) { + console.log(error); + } } return ( 0 && idx % rowCount === 0 ? 'break-before-page' : '' }> - {promptData.dataset_id} +
+ {promptData.dataset_id} +
{promptData.prompt_template_id} - + {recipe.metrics.map((metricName, idx) => { const name = idx < promptData.metrics.length - 1 ? `${metricName}, ` : metricName; - return {name}; + return ( + + {name} + + ); })} - -
{stringifiedMetrics}
+ +
+                      {stringifiedMetrics}
+                    
); diff --git a/app/benchmarking/report/types/benchmarkReportTypes.ts b/app/benchmarking/report/types/benchmarkReportTypes.ts index e0344ec1..c295c5de 100644 --- a/app/benchmarking/report/types/benchmarkReportTypes.ts +++ b/app/benchmarking/report/types/benchmarkReportTypes.ts @@ -49,18 +49,30 @@ type RecipePromptData = { duration: number; }; -type Metric = { +type MetricPromptAndScore = { + prompt: string; + predicted_value: string; + target: string; +}; + +type IndividualScore = { + individual_scores: { + successful?: MetricPromptAndScore[]; + unsuccessful?: MetricPromptAndScore[]; + }; +}; + +type GradingCriteria = { accuracy?: number; + attack_success_rate?: number; + toxicity_rate?: number; refusal_rate?: number; - safe?: number; - unsafe?: number; - unknown?: number; - grading_criteria: { - accuracy?: number; - attack_success_rate?: number; - toxicity_rate?: number; - refusal_rate?: number; - }; +}; + +type Metric = { + grading_criteria: GradingCriteria; +} & { + [key: string]: IndividualScore | number | GradingCriteria; }; type RougeScore = { @@ -111,4 +123,5 @@ export type { GradingScale, GradingColors, CookbookCategoryLabels, + IndividualScore, };