From b086ae5e0e5ffd25818885a08e4bb4aa20677306 Mon Sep 17 00:00:00 2001
From: Yutong Zhang <90831468+yutongzhang-microsoft@users.noreply.github.com>
Date: Tue, 31 Dec 2024 12:49:52 +0800
Subject: [PATCH] Implementation of "Impacted Area Based PR testing".  (#15666)

What is the motivation for this PR?
We introduce a new model of PR testing called "Impacted Area-Based PR Testing," designed to be time-efficient, cost-efficient, and highly flexible. The HLD is detailed in #14761, and this PR represents its implementation.

How did you do it?
We redefine the scope of PR testing by impacted area, which means we only run the test scripts that are actually affected by the changes.

How did you verify/test it?
Test by pipeline.
---
 .../calculate-instance-numbers.yml            |  76 ++++++++++
 .../calculate_instance_number.py              | 137 ++++++++++++++++++
 .../impacted_area_testing/constant.py         |  28 ++++
 .../get-impacted-area.yml                     |  74 ++++++++++
 .../impacted_area_testing/get_test_scripts.py | 122 ++++++++++++++++
 .../run-test-elastictest-template.yml         |   2 +-
 azure-pipelines.yml                           |  33 +++++
 7 files changed, 471 insertions(+), 1 deletion(-)
 create mode 100644 .azure-pipelines/impacted_area_testing/calculate-instance-numbers.yml
 create mode 100644 .azure-pipelines/impacted_area_testing/calculate_instance_number.py
 create mode 100644 .azure-pipelines/impacted_area_testing/constant.py
 create mode 100644 .azure-pipelines/impacted_area_testing/get-impacted-area.yml
 create mode 100644 .azure-pipelines/impacted_area_testing/get_test_scripts.py

diff --git a/.azure-pipelines/impacted_area_testing/calculate-instance-numbers.yml b/.azure-pipelines/impacted_area_testing/calculate-instance-numbers.yml
new file mode 100644
index 00000000000..4c1e17fecd6
--- /dev/null
+++ b/.azure-pipelines/impacted_area_testing/calculate-instance-numbers.yml
@@ -0,0 +1,76 @@
+parameters:
+  - name: TOPOLOGY
+    type: string
+    default: ""
+  # TOPOLOGY selects the key read from TEST_SCRIPTS; BUILD_BRANCH is passed as --branch to the calculator.
+  - name: BUILD_BRANCH
+    type: string
+    default: ""
+
+steps:
+- script: |
+    set -x
+
+    sudo apt-get update && sudo apt-get install -y jq
+
+    # The TEST_SCRIPTS pipeline variable holds a JSON object keyed by topology. Pick the
+    # list of this topology and join it with commas in a single jq call, so that a missing
+    # key (null cannot be joined) or invalid JSON is caught by the status check below.
+    SCRIPTS=$(echo '$(TEST_SCRIPTS)' | jq -r '."${{ parameters.TOPOLOGY }}" | join(",")')
+    if [[ $? -ne 0 ]]; then
+      echo "##vso[task.complete result=Failed;]Get test scripts of specfic topology fails."
+      exit 1
+    fi
+    echo -n "##vso[task.setvariable variable=SCRIPTS]$SCRIPTS"
+  displayName: "Get ${{ parameters.TOPOLOGY }} test scripts"
+
+- script: |
+    set -x
+
+    # Install the Azure CLI from the Microsoft apt repository only when "az" is not already on PATH.
+    if ! command -v az; then
+      echo "Azure CLI is not installed. Trying to install it..."
+
+      echo "Get packages needed for the installation process"
+      sudo apt-get -o DPkg::Lock::Timeout=600 update
+      sudo apt-get -o DPkg::Lock::Timeout=600 -y install apt-transport-https ca-certificates curl gnupg lsb-release
+
+      echo "Download and install the Microsoft signing key"
+      sudo mkdir -p /etc/apt/keyrings
+      curl -sLS https://packages.microsoft.com/keys/microsoft.asc |
+        gpg --dearmor | sudo tee /etc/apt/keyrings/microsoft.gpg > /dev/null
+      sudo chmod go+r /etc/apt/keyrings/microsoft.gpg
+
+      echo "Add the Azure CLI software repository"
+      AZ_DIST=$(lsb_release -cs)
+      echo "Types: deb
+    URIs: https://packages.microsoft.com/repos/azure-cli/
+    Suites: ${AZ_DIST}
+    Components: main
+    Architectures: $(dpkg --print-architecture)
+    Signed-by: /etc/apt/keyrings/microsoft.gpg" | sudo tee /etc/apt/sources.list.d/azure-cli.sources
+
+      echo "Update repository information and install the azure-cli package"
+      sudo apt-get -o DPkg::Lock::Timeout=600 update
+      sudo apt-get -o DPkg::Lock::Timeout=600 -y install azure-cli
+    else
+      echo "Azure CLI is already installed"
+    fi
+  displayName: "Install azure-cli"
+
+- script: |
+    set -x
+
+    # calculate_instance_number.py imports azure.kusto.data; one pip call installs everything needed.
+    pip install azure-kusto-data azure-identity
+
+    INSTANCE_NUMBER=$(python ./.azure-pipelines/impacted_area_testing/calculate_instance_number.py --scripts $(SCRIPTS) --topology ${{ parameters.TOPOLOGY }} --branch ${{ parameters.BUILD_BRANCH }})
+
+    if [[ $? -ne 0 ]]; then
+      echo "##vso[task.complete result=Failed;]Get instances number fails."
+      exit 1
+    fi
+
+    echo "$INSTANCE_NUMBER"
+    echo -n "##vso[task.setvariable variable=INSTANCE_NUMBER]$INSTANCE_NUMBER"
+  displayName: "Calculate instance number"
diff --git a/.azure-pipelines/impacted_area_testing/calculate_instance_number.py b/.azure-pipelines/impacted_area_testing/calculate_instance_number.py
new file mode 100644
index 00000000000..259da937f34
--- /dev/null
+++ b/.azure-pipelines/impacted_area_testing/calculate_instance_number.py
@@ -0,0 +1,137 @@
+import os
+import argparse
+import math
+from constant import PR_CHECKER_TOPOLOGY_NAME, MAX_INSTANCE_NUMBER, MAX_GET_TOKEN_RETRY_TIMES
+from azure.kusto.data import KustoConnectionStringBuilder, KustoClient
+
+
+def parse_list_from_str(s):
+    """Split a comma-separated string into stripped, non-empty items; None if empty."""
+    # Azure Pipelines cannot pass an empty parameter, so ' ' stands in for "empty";
+    # stripping first makes that placeholder (and any blank input) return None.
+    text = s.strip() if isinstance(s, str) else s
+    if not text:
+        return None
+
+    # Strip each comma-separated piece once and drop the ones that end up empty.
+    stripped_items = (piece.strip() for piece in text.split(','))
+    return [item for item in stripped_items if item]
+
+
+def get_access_token():
+    """Log in to Azure with the managed identity and return a Kusto access token.
+
+    The identity passed to `az login --username` is read from SONIC_AUTOMATION_UMI.
+    Each `az` command is attempted up to MAX_GET_TOKEN_RETRY_TIMES times.
+
+    Returns:
+        str: The token printed by `az account get-access-token`, whitespace-stripped.
+
+    Raises:
+        Exception: If `az login` or the token request still fails after all attempts.
+    """
+    managed_identity_id = os.environ.get("SONIC_AUTOMATION_UMI")
+
+    def run_with_retry(cmd, action, require_output=False):
+        # os.popen() does not raise when the command itself fails: the exit status is
+        # only reported by close() (None means success), so check it explicitly.
+        failure = None
+        for _ in range(MAX_GET_TOKEN_RETRY_TIMES):
+            try:
+                pipe = os.popen(cmd)
+                output = pipe.read().strip()
+                exit_status = pipe.close()
+            except Exception as exception:
+                failure = repr(exception)
+                continue
+            if exit_status is not None:
+                failure = f"command failed, close() returned {exit_status}"
+            elif require_output and not output:
+                failure = "command printed nothing to stdout"
+            else:
+                return output
+        raise Exception(f"Failed to {action} after {MAX_GET_TOKEN_RETRY_TIMES} attempts: {failure}.")
+
+    # 1. Run az login with re-try
+    run_with_retry(f"az login --identity --username {managed_identity_id}", "az login")
+
+    # 2. Get access token with re-try; stripping removes the trailing newline of the
+    #    tsv output so the caller receives the bare token.
+    get_token_cmd = "az account get-access-token --resource https://api.kusto.windows.net --query accessToken -o tsv"
+    return run_with_retry(get_token_cmd, "get token", require_output=True)
+
+
+def main(scripts, topology, branch):
+    """Print how many test instances are needed to run `scripts` on `topology`.
+
+    The average historical runtime of each comma-separated script is queried from Kusto
+    for `branch`; the total is split into 90-minute instances, between 1 and
+    MAX_INSTANCE_NUMBER.
+
+    Raises:
+        RuntimeError: If the Kusto cluster or the access token is not available.
+        ValueError: If `scripts` contains no script.
+        Exception: If connecting to or querying Kusto fails.
+    """
+    ingest_cluster = os.getenv("TEST_REPORT_QUERY_KUSTO_CLUSTER_BACKUP")
+    access_token = get_access_token()
+    if not ingest_cluster or not access_token:
+        raise RuntimeError("Could not load Kusto Credentials from environment")
+
+    # parse_list_from_str() returns None for an empty input; report that clearly instead of
+    # failing with a TypeError in the loop below.
+    scripts = parse_list_from_str(scripts)
+    if not scripts:
+        raise ValueError("No test scripts are given, can not calculate the instance number")
+
+    try:
+        kcsb = KustoConnectionStringBuilder.with_aad_application_token_authentication(
+            ingest_cluster, access_token)
+        client = KustoClient(kcsb)
+    except Exception as e:
+        raise Exception("Connect to kusto fails, error {}".format(e)) from e
+
+    topology_name, test_plan_mark = PR_CHECKER_TOPOLOGY_NAME[topology]
+    total_running_time = 0
+    for script in scripts:
+        # The baseline test runs the universal set of PR test scripts, so its results from
+        # the last 7 days give the historical running time of one script.
+        query = "V2TestCases " \
+                "| join kind=inner" \
+                "(TestPlans " \
+                "| where TestPlanType == 'PR' and Result == 'FINISHED' " \
+                f"and Topology == '{topology_name}' " \
+                f"and TestBranch == '{branch}' and TestPlanName contains '{test_plan_mark}' " \
+                "and TestPlanName contains '_BaselineTest_' and UploadTime > ago(7d)" \
+                "| order by UploadTime desc) on TestPlanId " \
+                f"| where FilePath == '{script}' " \
+                "| where Result !in ('failure', 'error') " \
+                "| summarize ActualCount = count(), TotalRuntime = sum(Runtime)"
+        try:
+            response = client.execute("SonicTestData", query)
+        except Exception as e:
+            raise Exception("Query results from Kusto fails, error {}".format(e)) from e
+
+        # Default to 1800s so a script without records (no row, or a zero count) never
+        # reuses the value computed for the previous script.
+        average_running_time = 1800
+        for row in response.primary_results[0]:
+            if row["ActualCount"]:
+                average_running_time = row["TotalRuntime"] / row["ActualCount"]
+        total_running_time += average_running_time
+
+    # Seconds -> minutes -> 90-minute instances (rounded up), kept within [1, MAX_INSTANCE_NUMBER].
+    print(max(1, min(math.ceil(total_running_time / 60 / 90), MAX_INSTANCE_NUMBER)))
+
+
+if __name__ == '__main__':
+    # Register the three optional string flags (each defaults to "") and run main().
+    parser = argparse.ArgumentParser()
+    for flag, description in (
+            ("--topology", "The topology of testplan"),
+            ("--scripts", "Test scripts to be executed"),
+            ("--branch", "Test branch")):
+        parser.add_argument(flag, help=description, type=str, default="")
+
+    cli_args = parser.parse_args()
+    main(cli_args.scripts, cli_args.topology, cli_args.branch)
diff --git a/.azure-pipelines/impacted_area_testing/constant.py b/.azure-pipelines/impacted_area_testing/constant.py
new file mode 100644
index 00000000000..a9b1c5f8dce
--- /dev/null
+++ b/.azure-pipelines/impacted_area_testing/constant.py
@@ -0,0 +1,28 @@
+# Now, we only have below types of PR checker
+# - dpu
+# - dualtor-t0
+# - multi-asic-t1-lag
+# - t0
+# - t0-2vlans
+# - t0-sonic
+# - t1-lag
+PR_TOPOLOGY_TYPE = ["t0", "t0-2vlans", "t0-sonic", "t1", "t1-multi-asic", "dpu", "dualtor"]
+
+EXCLUDE_TEST_SCRIPTS = [
+    "test_posttest.py",
+    "test_pretest.py"
+]
+
+# Maps each PR topology type to [topology recorded in Kusto, substring contained in the PR test plan name].
+PR_CHECKER_TOPOLOGY_NAME = {
+    "t0": ["t0", "_kvmtest-t0_"],
+    "t0-2vlans": ["t0", "_kvmtest-t0-2vlans_"],
+    "t0-sonic": ["t0-64-32", "_kvmtest-t0-sonic_"],
+    "t1": ["t1-lag", "_kvmtest-t1-lag_"],
+    "t1-multi-asic": ["t1-8-lag", "_kvmtest-multi-asic-t1-lag_"],
+    "dpu": ["dpu", "_kvmtest-dpu_"],
+    "dualtor": ["dualtor", "_kvmtest-dualtor-t0_"]
+}
+
+MAX_INSTANCE_NUMBER = 25
+MAX_GET_TOKEN_RETRY_TIMES = 3
diff --git a/.azure-pipelines/impacted_area_testing/get-impacted-area.yml b/.azure-pipelines/impacted_area_testing/get-impacted-area.yml
new file mode 100644
index 00000000000..4b133152156
--- /dev/null
+++ b/.azure-pipelines/impacted_area_testing/get-impacted-area.yml
@@ -0,0 +1,74 @@
+steps:
+- script: |
+    set -x
+
+    git fetch --all
+    # Run git diff on its own: after a pipeline, $? only reflects the last command (tr),
+    # so a failing git diff used to go unnoticed.
+    DIFF_FILES=$(git diff origin/master HEAD --name-only)
+    if [[ $? -ne 0 ]]; then
+      echo "##vso[task.complete result=Failed;]Get diff folders fails."
+      exit 1
+    fi
+    DIFF_FOLDERS=$(echo "$DIFF_FILES" | xargs -r -n1 dirname | sort -u | tr '\n' ' ')
+    echo -n "##vso[task.setvariable variable=DIFF_FOLDERS]$DIFF_FOLDERS"
+  continueOnError: false
+  displayName: "Get diff folders"
+
+- script: |
+    set -x
+
+    pip install PyYAML
+    pip install natsort
+
+    sudo apt-get install -y jq
+
+    FINAL_FEATURES=""
+    IFS=' ' read -ra FEATURES_LIST <<< "$(DIFF_FOLDERS)"
+    for FEATURE in "${FEATURES_LIST[@]}"
+    do
+      # If changes contain the common part in tests folder, the scope of PR testing is all test scripts.
+      if [[ "$FEATURE" == *tests/common* ]]; then
+        FINAL_FEATURES=""
+        break
+      # If changes only limited to specific feature, the scope of PR testing is impacted area.
+      # Match only paths under the top-level "tests" folder, not any path containing "tests".
+      elif [[ "$FEATURE" == tests || "$FEATURE" == tests/* ]]; then
+        # Cut the feature path
+        if [[ $FEATURE == */*/* ]]; then
+          FEATURE=$(echo "$FEATURE" | cut -d'/' -f1-2)
+        fi
+
+        if [[ -z "$FINAL_FEATURES" ]]; then
+          FINAL_FEATURES="${FEATURE#tests/}"
+        else
+          FINAL_FEATURES="$FINAL_FEATURES,${FEATURE#tests/}"
+        fi
+
+      # If changes related to other folders except tests, we also consider them as common part.
+      # The scope of PR testing is all test scripts.
+      else
+        FINAL_FEATURES=""
+        break
+      fi
+    done
+
+    TEST_SCRIPTS=$(python ./.azure-pipelines/impacted_area_testing/get_test_scripts.py --features ${FINAL_FEATURES} --location tests)
+
+    if [[ $? -ne 0 ]]; then
+      echo "##vso[task.complete result=Failed;]Get test scripts fails."
+      exit 1
+    fi
+
+    PR_CHECKERS=$(echo "${TEST_SCRIPTS}" | jq -c 'keys')
+
+    if [[ $? -ne 0 ]]; then
+      echo "##vso[task.complete result=Failed;]Get valid PR checkers fails."
+      exit 1
+    fi
+
+    echo "##vso[task.setvariable variable=PR_CHECKERS;isOutput=true]$PR_CHECKERS"
+    echo "##vso[task.setvariable variable=TEST_SCRIPTS;isOutput=true]$TEST_SCRIPTS"
+  name: SetVariableTask
+  continueOnError: false
+  displayName: "Get impacted area"
diff --git a/.azure-pipelines/impacted_area_testing/get_test_scripts.py b/.azure-pipelines/impacted_area_testing/get_test_scripts.py
new file mode 100644
index 00000000000..787dde48347
--- /dev/null
+++ b/.azure-pipelines/impacted_area_testing/get_test_scripts.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+
+"""
+    Scripts for getting test scripts in impacted area
+    Example:
+        python impacted_area_testing/get_test_scripts.py --features vrf,gnmi --location ../tests
+
+    It will get all test scripts in specific impacted area.
+"""
+import os
+import re
+import logging
+import json
+import argparse
+from natsort import natsorted
+from constant import PR_TOPOLOGY_TYPE, EXCLUDE_TEST_SCRIPTS
+
+
+def topo_name_to_type(topo_name):
+    """Map a topology mark to the topology type it belongs to."""
+    supported_names = (r'^(wan|wan-pub-isis|wan-com|wan-pub|wan-pub-cisco|wan-3link-tg|'
+                       r't0|t0-52|t0-mclag|mgmttor|m0|mc0|mx|'
+                       r't1|t1-lag|t1-56-lag|t1-64-lag|'
+                       r'ptf|fullmesh|dualtor|t2|tgen|multidut-tgen|dpu|any|snappi|util|'
+                       r't0-2vlans|t0-sonic|t1-multi-asic)$')
+    matched = re.match(supported_names, topo_name)
+    if not matched:
+        # Unknown names are reported and handed back unchanged.
+        logging.warning("Unsupported testbed type - {}".format(topo_name))
+        return topo_name
+
+    name = matched.group()
+    # Several testbed names are just variants of the 't0' / 't1' categories.
+    if name in ('mgmttor', 'm0', 'mc0', 'mx', 't0-52', 't0-mclag'):
+        return 't0'
+    return 't1' if name in ('t1-lag', 't1-56-lag', 't1-64-lag') else name
+
+
+def distribute_scripts_to_PR_checkers(match, script_name, test_scripts_per_topology_type):
+    """Add `script_name` to the list of each topology in `match` ("any" means t0 and t1)."""
+    for raw_mark in match.group(1).split(","):
+        mark = raw_mark.strip().strip('"').strip("'")
+        if mark == "any":
+            targets = (test_scripts_per_topology_type[key] for key in ("t0", "t1"))
+        else:
+            bucket = test_scripts_per_topology_type.get(topo_name_to_type(mark))
+            targets = () if bucket is None else (bucket,)
+        for target in targets:
+            if script_name not in target:
+                target.append(script_name)
+
+
+def collect_scripts_by_topology_type(features: str, location: str) -> dict:
+    """
+    This function collects all test scripts under the impacted area and categorizes them by topology type.
+
+    Args:
+        features: Comma-separated feature folders (relative to `location`) that form the impacted area.
+        location: The location of test scripts
+
+    Returns:
+        Dict: Non-empty lists of script paths (relative to `location`) keyed by topology type.
+
+    Raises:
+        Exception: If a test script cannot be read.
+    """
+    # This is just for the first stage of rolling out
+    # To avoid the overuse of resource, we will ignore the PR which modifies the common part.
+    if features == "":
+        return {}
+
+    # Recursively find all files starting with "test_" and ending with ".py" under each feature.
+    # Empty items (e.g. from "a,,b") are skipped, otherwise the whole `location` would be walked.
+    scripts = []
+    for feature in features.split(","):
+        if not feature.strip():
+            continue
+        for root, _, file_names in os.walk(os.path.join(location, feature)):
+            scripts.extend(os.path.join(root, name) for name in file_names
+                           if name.startswith("test_") and name.endswith(".py"))
+    scripts = natsorted(scripts)
+
+    # Open each file and search for regex pattern
+    pattern = re.compile(r"[^@]pytest\.mark\.topology\(([^\)]*)\)")
+    # Init the dict to record the mapping of topology type and test scripts
+    test_scripts_per_topology_type = {topology_type: [] for topology_type in PR_TOPOLOGY_TYPE}
+
+    for s in scripts:
+        # Use the path relative to `location` as script name. Slicing off len(location) + 1
+        # characters is wrong when `location` is empty or ends with a path separator.
+        script_name = os.path.relpath(s, location)
+        if script_name in EXCLUDE_TEST_SCRIPTS:
+            continue
+
+        try:
+            with open(s, 'r') as script:
+                for line in script:
+                    # Get topology type of script from mark `pytest.mark.topology`
+                    match = pattern.search(line)
+                    if match:
+                        distribute_scripts_to_PR_checkers(match, script_name, test_scripts_per_topology_type)
+                        break
+        except Exception as e:
+            raise Exception('Exception occurred while trying to get topology in {}, error {}'.format(s, e)) from e
+
+    return {k: v for k, v in test_scripts_per_topology_type.items() if v}
+
+
+def main(features, location):
+    """Print the impacted test scripts, grouped by topology type, as one JSON object."""
+    print(json.dumps(collect_scripts_by_topology_type(features, location)))
+
+
+if __name__ == '__main__':
+    # A bare `--features` (no value) is accepted and parsed as "" because of nargs='?' / const="".
+    features_options = dict(help="Impacted area", nargs='?', const="", type=str, default="")
+    location_options = dict(help="The location of folder `tests`", type=str, default="")
+
+    cli_parser = argparse.ArgumentParser()
+    cli_parser.add_argument("--features", **features_options)
+    cli_parser.add_argument("--location", **location_options)
+    main(**vars(cli_parser.parse_args()))
diff --git a/.azure-pipelines/run-test-elastictest-template.yml b/.azure-pipelines/run-test-elastictest-template.yml
index 49220090daa..a5bca5f265c 100644
--- a/.azure-pipelines/run-test-elastictest-template.yml
+++ b/.azure-pipelines/run-test-elastictest-template.yml
@@ -230,7 +230,7 @@ steps:
     displayName: "Install azure-cli"
 
   - script: |
-      set -e
+      set -ex
 
       pip install PyYAML
 
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 12998b6d836..4fee71f3586 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -74,6 +74,39 @@ stages:
       value: $(Build.SourceBranchName)
 
   jobs:
+  - job: get_impacted_area
+    displayName: "Get impacted area"
+    pool: sonic-ubuntu-1c
+    continueOnError: true
+    timeoutInMinutes: 240
+    steps:
+      - template: .azure-pipelines/impacted_area_testing/get-impacted-area.yml
+
+  - job: impacted_area_t0_elastictest
+    displayName: "impacted-area-kvmtest-t0 by Elastictest - optional"
+    dependsOn: get_impacted_area
+    # PR_CHECKERS is a JSON list of topology keys; match the quoted key so "t0-sonic" alone does not trigger t0.
+    condition: contains(dependencies.get_impacted_area.outputs['SetVariableTask.PR_CHECKERS'], '"t0"')
+    variables:
+      TEST_SCRIPTS: $[ dependencies.get_impacted_area.outputs['SetVariableTask.TEST_SCRIPTS'] ]
+    timeoutInMinutes: 240
+    continueOnError: true
+    pool: sonic-ubuntu-1c
+    steps:
+      - template: .azure-pipelines/impacted_area_testing/calculate-instance-numbers.yml
+        parameters:
+          TOPOLOGY: t0
+          BUILD_BRANCH: $(BUILD_BRANCH)
+      - template: .azure-pipelines/run-test-elastictest-template.yml
+        parameters:
+          TOPOLOGY: t0
+          SCRIPTS: $(SCRIPTS)
+          MIN_WORKER: $(INSTANCE_NUMBER)
+          MAX_WORKER: $(INSTANCE_NUMBER)
+          KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
+          MGMT_BRANCH: $(BUILD_BRANCH)
+          STOP_ON_FAILURE: "False"
+
   - job: t0_elastictest
     displayName: "kvmtest-t0 by Elastictest"
     timeoutInMinutes: 240