diff --git a/changelog/7423.improvement.md b/changelog/7423.improvement.md
new file mode 100644
index 000000000000..87fbbd6c66f5
--- /dev/null
+++ b/changelog/7423.improvement.md
@@ -0,0 +1 @@
+Use response selector keys (sub-intents such as `faq/ask_name`) instead of full response texts as labels when plotting the confusion matrix during NLU evaluation, to improve readability.
\ No newline at end of file
diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py
index bb51893e6f2a..1c307ad03a3a 100644
--- a/rasa/nlu/test.py
+++ b/rasa/nlu/test.py
@@ -31,6 +31,7 @@
     ENTITY_ATTRIBUTE_TYPE,
     ENTITY_ATTRIBUTE_GROUP,
     ENTITY_ATTRIBUTE_ROLE,
+    RESPONSE_KEY_ATTRIBUTE,
 )
 from rasa.model import get_model
 from rasa.nlu import config, training_data, utils
@@ -62,7 +63,7 @@
 
 ResponseSelectionEvaluationResult = namedtuple(
     "ResponseSelectionEvaluationResult",
-    "intent_target " "response_target " "response_prediction " "message " "confidence",
+    "response_key response_prediction_full_intent message confidence",
 )
 
 EntityEvaluationResult = namedtuple(
@@ -235,10 +236,10 @@ def remove_empty_response_examples(
     for r in response_results:
         # substitute None values with empty string
         # to enable sklearn evaluation
-        if r.response_prediction is None:
-            r = r._replace(response_prediction="")
+        if r.response_prediction_full_intent is None:
+            r = r._replace(response_prediction_full_intent="")
 
-        if r.response_target != "" and r.response_target is not None:
+        if r.response_key != "" and r.response_key is not None:
             filtered.append(r)
 
     return filtered
@@ -373,7 +374,7 @@ def evaluate_response_selections(
     )
 
     target_responses, predicted_responses = _targets_predictions_from(
-        response_selection_results, "response_target", "response_prediction"
+        response_selection_results, "response_key", "response_prediction_full_intent"
     )
 
     if report_folder:
@@ -409,9 +410,8 @@ def evaluate_response_selections(
     predictions = [
         {
             "text": res.message,
-            "intent_target": res.intent_target,
-            "response_target": res.response_target,
-            "response_predicted": res.response_prediction,
+            "response_key": res.response_key,
+            "response_predicted": res.response_prediction_full_intent,
             "confidence": res.confidence,
         }
         for res in response_selection_results
@@ -1050,13 +1050,16 @@ def get_eval_data(
                 response_prediction_key, {}
             ).get(OPEN_UTTERANCE_PREDICTION_KEY, {})
 
-            response_target = example.get("response", "")
+            response_prediction_full_intent = selector_properties.get(
+                response_prediction_key, {}
+            ).get("full_retrieval_intent", {})
+
+            response_key = example.get_combined_intent_response_key()
 
             response_selection_results.append(
                 ResponseSelectionEvaluationResult(
-                    intent_target,
-                    response_target,
-                    response_prediction.get("name"),
+                    response_key,
+                    response_prediction_full_intent,
                     result.get("text", {}),
                     response_prediction.get("confidence"),
                 )
@@ -1486,7 +1489,9 @@ def compute_metrics(
     response_selection_metrics = {}
     if response_selection_results:
         response_selection_metrics = _compute_metrics(
-            response_selection_results, "response_target", "response_prediction"
+            response_selection_results,
+            "response_key",
+            "response_prediction_full_intent",
         )
 
     return (
diff --git a/tests/nlu/test_evaluation.py b/tests/nlu/test_evaluation.py
index bbe52646dc80..dcd9025451c2 100644
--- a/tests/nlu/test_evaluation.py
+++ b/tests/nlu/test_evaluation.py
@@ -470,18 +470,10 @@ def test_response_evaluation_report(tmpdir_factory):
 
     response_results = [
         ResponseSelectionEvaluationResult(
-            "chitchat",
-            "It's sunny in Berlin",
-            "It's sunny in Berlin",
-            "What's the weather",
-            0.65432,
+            "weather/ask_weather", "weather/ask_weather", "What's the weather", 0.65432,
         ),
         ResponseSelectionEvaluationResult(
-            "chitchat",
-            "My name is Mr.bot",
-            "My name is Mr.bot",
-            "What's your name?",
-            0.98765,
+            "faq/ask_name", "faq/ask_name", "What's your name?", 0.98765,
         ),
     ]
 
@@ -499,14 +491,13 @@ def test_response_evaluation_report(tmpdir_factory):
 
     prediction = {
         "text": "What's your name?",
-        "intent_target": "chitchat",
-        "response_target": "My name is Mr.bot",
-        "response_predicted": "My name is Mr.bot",
+        "response_key": "faq/ask_name",
+        "response_predicted": "faq/ask_name",
         "confidence": 0.98765,
     }
 
     assert len(report.keys()) == 5
-    assert report["My name is Mr.bot"] == name_query_results
+    assert report["faq/ask_name"] == name_query_results
     assert result["predictions"][1] == prediction
 
 
@@ -590,23 +581,16 @@ def test_empty_intent_removal():
 
 def test_empty_response_removal():
     response_results = [
+        ResponseSelectionEvaluationResult(None, None, "What's the weather", 0.65432,),
         ResponseSelectionEvaluationResult(
-            "chitchat", None, "It's sunny in Berlin", "What's the weather", 0.65432
-        ),
-        ResponseSelectionEvaluationResult(
-            "chitchat",
-            "My name is Mr.bot",
-            "My name is Mr.bot",
-            "What's your name?",
-            0.98765,
+            "faq/ask_name", "faq/ask_name", "What's your name?", 0.98765,
         ),
     ]
     response_results = remove_empty_response_examples(response_results)
 
     assert len(response_results) == 1
-    assert response_results[0].intent_target == "chitchat"
-    assert response_results[0].response_target == "My name is Mr.bot"
-    assert response_results[0].response_prediction == "My name is Mr.bot"
+    assert response_results[0].response_key == "faq/ask_name"
+    assert response_results[0].response_prediction_full_intent == "faq/ask_name"
     assert response_results[0].confidence == 0.98765
     assert response_results[0].message == "What's your name?"