Skip to content

Commit

Permalink
Implementation of "Impacted Area Based PR testing". (#15666)
Browse files Browse the repository at this point in the history
What is the motivation for this PR?
We introduce a new model of PR testing called "Impacted Area-Based PR Testing," designed to be time-efficient, cost-efficient, and highly flexible. The HLD is detailed in #14761, and this PR represents its implementation.

How did you do it?
We redefine the scope of PR testing by impacted area, which means we only run the test scripts that are actually affected by the changes.

How did you verify/test it?
Test by pipeline.
  • Loading branch information
yutongzhang-microsoft authored Dec 31, 2024
1 parent 1574885 commit b086ae5
Show file tree
Hide file tree
Showing 7 changed files with 471 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
parameters:
# Topology key used to select the scripts from the TEST_SCRIPTS JSON and
# passed to calculate_instance_number.py as --topology.
- name: TOPOLOGY
  type: string
  default: ""

# Branch passed to calculate_instance_number.py as --branch.
- name: BUILD_BRANCH
  type: string
  default: ""

steps:
# Extract the script list of the requested topology from the TEST_SCRIPTS JSON
# and publish it as the comma-separated pipeline variable SCRIPTS.
- script: |
    set -x
    sudo apt-get update && sudo apt-get install -y jq
    # -e makes jq exit non-zero when the topology key is missing (result is null),
    # so the check below actually catches an unknown topology.
    TEST_SCRIPTS=$(echo '$(TEST_SCRIPTS)' | jq -e -r -c '."${{ parameters.TOPOLOGY }}"')
    if [[ $? -ne 0 ]]; then
      echo "##vso[task.complete result=Failed;]Get test scripts of specific topology fails."
      exit 1
    fi
    SCRIPTS=$(echo "$TEST_SCRIPTS" | jq -r '. | join(",")')
    # join fails when the value is not an array of strings; do not publish garbage.
    if [[ $? -ne 0 ]]; then
      echo "##vso[task.complete result=Failed;]Join test scripts of specific topology fails."
      exit 1
    fi
    echo -n "##vso[task.setvariable variable=SCRIPTS]$SCRIPTS"
  displayName: "Get ${{ parameters.TOPOLOGY }} test scripts"

# Make sure the `az` command is available on the agent; install it from the
# Microsoft apt repository when it is missing.
- script: |
    set -x
    # Check if azure cli is installed. If not, try to install it
    if ! command -v az; then
      echo "Azure CLI is not installed. Trying to install it..."
      echo "Get packages needed for the installation process"
      # DPkg::Lock::Timeout=600 is passed to every apt-get call in this step.
      sudo apt-get -o DPkg::Lock::Timeout=600 update
      sudo apt-get -o DPkg::Lock::Timeout=600 -y install apt-transport-https ca-certificates curl gnupg lsb-release
      echo "Download and install the Microsoft signing key"
      sudo mkdir -p /etc/apt/keyrings
      curl -sLS https://packages.microsoft.com/keys/microsoft.asc |
      gpg --dearmor | sudo tee /etc/apt/keyrings/microsoft.gpg > /dev/null
      sudo chmod go+r /etc/apt/keyrings/microsoft.gpg
      echo "Add the Azure CLI software repository"
      AZ_DIST=$(lsb_release -cs)
      # The continuation lines of the string below stay at the script's left
      # margin on purpose: any leading whitespace would be written into the
      # azure-cli.sources file.
      echo "Types: deb
    URIs: https://packages.microsoft.com/repos/azure-cli/
    Suites: ${AZ_DIST}
    Components: main
    Architectures: $(dpkg --print-architecture)
    Signed-by: /etc/apt/keyrings/microsoft.gpg" | sudo tee /etc/apt/sources.list.d/azure-cli.sources
      echo "Update repository information and install the azure-cli package"
      sudo apt-get -o DPkg::Lock::Timeout=600 update
      sudo apt-get -o DPkg::Lock::Timeout=600 -y install azure-cli
    else
      echo "Azure CLI is already installed"
    fi
  displayName: "Install azure-cli"

# Run calculate_instance_number.py and publish its stdout as the pipeline
# variable INSTANCE_NUMBER.
- script: |
    set -x
    pip install azure-kusto-data azure-identity
    # Arguments are quoted so an empty or whitespace-containing value is still
    # passed as exactly one argument.
    INSTANCE_NUMBER=$(python ./.azure-pipelines/impacted_area_testing/calculate_instance_number.py --scripts "$(SCRIPTS)" --topology "${{ parameters.TOPOLOGY }}" --branch "${{ parameters.BUILD_BRANCH }}")
    if [[ $? -ne 0 ]]; then
      echo "##vso[task.complete result=Failed;]Get instances number fails."
      exit 1
    fi
    echo "$INSTANCE_NUMBER"
    echo -n "##vso[task.setvariable variable=INSTANCE_NUMBER]$INSTANCE_NUMBER"
  displayName: "Calculate instance number"
137 changes: 137 additions & 0 deletions .azure-pipelines/impacted_area_testing/calculate_instance_number.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import os
import argparse
import math
from constant import PR_CHECKER_TOPOLOGY_NAME, MAX_INSTANCE_NUMBER, MAX_GET_TOKEN_RETRY_TIMES
from azure.kusto.data import KustoConnectionStringBuilder, KustoClient


def parse_list_from_str(s):
    """Split a comma-separated string into a list of trimmed, non-empty items.

    Azure Pipelines cannot pass an empty parameter, so a single space (' ')
    is used as the stand-in for "no value". Whitespace-only input is
    therefore treated as empty.

    Returns None when the input is empty, otherwise the list of items.
    """
    text = s.strip() if isinstance(s, str) else s
    if not text:
        return None

    items = []
    for piece in text.split(','):
        trimmed = piece.strip()
        if trimmed:
            items.append(trimmed)
    return items


def get_access_token():
    """Log in with the managed identity and return a Kusto access token.

    Runs ``az login --identity`` for the identity named by the
    ``SONIC_AUTOMATION_UMI`` environment variable, then asks the Azure CLI
    for an access token for ``https://api.kusto.windows.net``. Each command
    is attempted up to ``MAX_GET_TOKEN_RETRY_TIMES`` times.

    Nothing is written to this process's stdout: the commands' output is
    captured, and failures are reported through the raised exception.

    Returns:
        The access token with surrounding whitespace (including the trailing
        newline printed by the CLI) removed.

    Raises:
        RuntimeError: If ``SONIC_AUTOMATION_UMI`` is unset, or if either
            command still fails (or yields no token) after all attempts.
    """
    # Function-scope import: the module's import block does not provide it.
    import subprocess

    managed_identity_id = os.environ.get("SONIC_AUTOMATION_UMI")
    if not managed_identity_id:
        # Without this guard the literal string "None" would be passed to az.
        raise RuntimeError("Environment variable SONIC_AUTOMATION_UMI is not set.")

    def run_with_retry(cmd, action, require_output=False):
        """Run cmd until it succeeds or the attempt budget is exhausted."""
        last_error = None
        for _ in range(MAX_GET_TOKEN_RETRY_TIMES):
            try:
                # Argument list + no shell: values are never shell-interpreted.
                # check=True turns a non-zero exit code into an exception.
                completed = subprocess.run(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    universal_newlines=True,
                    check=True,
                )
            except subprocess.CalledProcessError as exception:
                stderr_text = (exception.stderr or "").strip()
                last_error = f"exit code {exception.returncode}: {stderr_text}"
                continue
            except OSError as exception:
                # e.g. the az executable is not installed / not on PATH.
                last_error = repr(exception)
                continue

            output = completed.stdout.strip()
            if output or not require_output:
                return output
            last_error = "command produced no output"

        raise RuntimeError(
            f"Failed to {action} after {MAX_GET_TOKEN_RETRY_TIMES} attempts: {last_error}"
        )

    # 1. Run az login with re-try
    run_with_retry(
        ["az", "login", "--identity", "--username", managed_identity_id],
        "az login",
    )

    # 2. Get access token with re-try
    return run_with_retry(
        [
            "az", "account", "get-access-token",
            "--resource", "https://api.kusto.windows.net",
            "--query", "accessToken",
            "-o", "tsv",
        ],
        "get token",
        require_output=True,
    )


def _kql_string_literal(value):
    """Return value as a single-quoted KQL string literal.

    Backslashes and single quotes are backslash-escaped so the value cannot
    end the literal early and change the meaning of the query.
    """
    escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


def main(scripts, topology, branch):
    """Print the number of test instances needed to run the given scripts.

    For every script, the average historical runtime is queried from Kusto
    (successful test cases of finished baseline PR test plans uploaded in the
    last 7 days for the given topology and branch). The averages are summed
    and converted into an instance count, capped at MAX_INSTANCE_NUMBER.
    The result is printed to stdout, which the pipeline step captures.

    Args:
        scripts: Comma-separated test script paths.
        topology: A key of PR_CHECKER_TOPOLOGY_NAME.
        branch: Branch name matched against TestBranch in Kusto.

    Raises:
        ValueError: If topology is unknown or no script is given.
        RuntimeError: If the Kusto cluster or the access token is missing.
        Exception: If connecting to or querying Kusto fails.
    """
    # Runtime assumed for a script that has no usable history in Kusto.
    default_script_runtime_seconds = 1800
    # For one instance, we plan to assign 90 minutes to run test scripts.
    minutes_per_instance = 90

    # Validate the inputs before doing any login or network work.
    if topology not in PR_CHECKER_TOPOLOGY_NAME:
        raise ValueError(
            f"Unknown topology '{topology}', expected one of {sorted(PR_CHECKER_TOPOLOGY_NAME)}")

    script_list = parse_list_from_str(scripts)
    if not script_list:
        # parse_list_from_str returns None for empty input; iterating over it
        # would otherwise fail with an opaque TypeError.
        raise ValueError("No test scripts were provided to calculate the instance number.")

    ingest_cluster = os.getenv("TEST_REPORT_QUERY_KUSTO_CLUSTER_BACKUP")
    access_token = get_access_token()

    if not ingest_cluster or not access_token:
        raise RuntimeError(
            "Could not load Kusto Credentials from environment")

    try:
        kcsb = KustoConnectionStringBuilder.with_aad_application_token_authentication(
            ingest_cluster, access_token)
        client = KustoClient(kcsb)
    except Exception as e:
        raise Exception("Connect to kusto fails, error {}".format(e)) from e

    kusto_topology = PR_CHECKER_TOPOLOGY_NAME[topology][0]
    checker_name = PR_CHECKER_TOPOLOGY_NAME[topology][1]

    # The test-plan filter does not depend on the script, so build it once.
    # As baseline test is the universal set of PR test, its history is used
    # as the source of the running time of each script.
    plans_filter = (
        "TestPlans "
        "| where TestPlanType == 'PR' and Result == 'FINISHED' "
        f"and Topology == {_kql_string_literal(kusto_topology)} "
        f"and TestBranch == {_kql_string_literal(branch)} "
        f"and TestPlanName contains {_kql_string_literal(checker_name)} "
        "and TestPlanName contains '_BaselineTest_' and UploadTime > ago(7d)"
        "| order by UploadTime desc"
    )

    total_running_time = 0

    for script in script_list:
        query = (
            "V2TestCases "
            "| join kind=inner"
            f"({plans_filter}) on TestPlanId "
            f"| where FilePath == {_kql_string_literal(script)} "
            "| where Result !in ('failure', 'error') "
            "| summarize ActualCount = count(), TotalRuntime = sum(Runtime)"
        )

        try:
            response = client.execute("SonicTestData", query)
        except Exception as e:
            raise Exception("Query results from Kusto fails, error {}".format(e)) from e

        # Start from the default so that both "no row returned" and
        # "zero matching records" fall back to it.
        average_running_time = default_script_runtime_seconds
        for row in response.primary_results[0]:
            actual_count = row["ActualCount"]
            if actual_count:
                # Average runtime of one execution of this script.
                average_running_time = row["TotalRuntime"] / actual_count

        total_running_time += average_running_time

    # Total running time is calculated by seconds, divide by 60 to get minutes.
    # Obtain the number of instances by rounding up the calculation.
    # To prevent unexpected situations, we set the maximum number of instance.
    print(min(math.ceil(total_running_time / 60 / minutes_per_instance), MAX_INSTANCE_NUMBER))


if __name__ == '__main__':
    # Command-line entry point: every option is a string that defaults to "".
    arg_parser = argparse.ArgumentParser()
    for flag, description in (
        ("--topology", "The topology of testplan"),
        ("--scripts", "Test scripts to be executed"),
        ("--branch", "Test branch"),
    ):
        arg_parser.add_argument(flag, help=description, type=str, default="")
    cli_args = arg_parser.parse_args()

    main(cli_args.scripts, cli_args.topology, cli_args.branch)
28 changes: 28 additions & 0 deletions .azure-pipelines/impacted_area_testing/constant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# PR checkers that currently exist:
#   dpu, dualtor-t0, multi-asic-t1-lag, t0, t0-2vlans, t0-sonic, t1-lag
#
# Topology types used for PR testing. The entries are the same as the keys
# of PR_CHECKER_TOPOLOGY_NAME below.
PR_TOPOLOGY_TYPE = [
    "t0",
    "t0-2vlans",
    "t0-sonic",
    "t1",
    "t1-multi-asic",
    "dpu",
    "dualtor",
]

# Script file names to exclude (presumably the pre/post-test hooks that are
# not feature tests -- confirm against the consumer of this list).
EXCLUDE_TEST_SCRIPTS = ["test_posttest.py", "test_pretest.py"]

# Maps a PR topology type to a two-item list:
#   [0] the topology as recorded in Kusto,
#   [1] the fragment of the test plan name that identifies the PR checker.
PR_CHECKER_TOPOLOGY_NAME = {
    "t0": [
        "t0",
        "_kvmtest-t0_",
    ],
    "t0-2vlans": [
        "t0",
        "_kvmtest-t0-2vlans_",
    ],
    "t0-sonic": [
        "t0-64-32",
        "_kvmtest-t0-sonic_",
    ],
    "t1": [
        "t1-lag",
        "_kvmtest-t1-lag_",
    ],
    "t1-multi-asic": [
        "t1-8-lag",
        "_kvmtest-multi-asic-t1-lag_",
    ],
    "dpu": [
        "dpu",
        "_kvmtest-dpu_",
    ],
    "dualtor": [
        "dualtor",
        "_kvmtest-dualtor-t0_",
    ],
}

# Upper bound for the calculated number of test instances.
MAX_INSTANCE_NUMBER = 25
# Number of attempts allowed for az login / token retrieval.
MAX_GET_TOKEN_RETRY_TIMES = 3
74 changes: 74 additions & 0 deletions .azure-pipelines/impacted_area_testing/get-impacted-area.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
steps:
# Publish the space-separated list of folders touched by the change (relative
# to origin/master) as the pipeline variable DIFF_FOLDERS.
- script: |
    set -x
    git fetch --all
    # Run git diff on its own so that $? is git's exit status. Inside a
    # pipeline $? only reflects the last command (tr), which hid git failures.
    DIFF_FILES=$(git diff origin/master HEAD --name-only)
    if [[ $? -ne 0 ]]; then
      echo "##vso[task.complete result=Failed;]Get diff folders fails."
      exit 1
    fi
    DIFF_FOLDERS=$(printf '%s\n' "$DIFF_FILES" | xargs -n1 dirname | sort -u | tr '\n' ' ')
    echo -n "##vso[task.setvariable variable=DIFF_FOLDERS]$DIFF_FOLDERS"
  continueOnError: false
  displayName: "Get diff folders"

# Turn DIFF_FOLDERS into a comma-separated feature list, ask
# get_test_scripts.py for the matching scripts, and publish the output
# variables PR_CHECKERS and TEST_SCRIPTS.
- script: |
    set -x
    pip install PyYAML
    pip install natsort
    sudo apt-get install -y jq
    FINAL_FEATURES=""
    IFS=' ' read -ra FEATURES_LIST <<< "$(DIFF_FOLDERS)"
    for FEATURE in "${FEATURES_LIST[@]}"
    do
      # If changes contain the common part in tests folder, the scope of PR testing is all test scripts.
      if [[ "$FEATURE" == *tests/common* ]]; then
        FINAL_FEATURES=""
        break
      # If changes are limited to specific features, the scope of PR testing is the impacted area.
      # Only the tests folder itself or a path below it qualifies; an unanchored
      # regex would also accept any other path that merely contains "tests".
      elif [[ "$FEATURE" == tests || "$FEATURE" == tests/* ]]; then
        # Cut the feature path
        if [[ $FEATURE == */*/* ]]; then
          FEATURE=$(echo "$FEATURE" | cut -d'/' -f1-2)
        fi
        if [[ -z "$FINAL_FEATURES" ]]; then
          FINAL_FEATURES="${FEATURE#tests/}"
        else
          FINAL_FEATURES="$FINAL_FEATURES,${FEATURE#tests/}"
        fi
      # If changes relate to folders other than tests, we also consider them as common part.
      # The scope of PR testing is all test scripts.
      else
        FINAL_FEATURES=""
        break
      fi
    done
    TEST_SCRIPTS=$(python ./.azure-pipelines/impacted_area_testing/get_test_scripts.py --features ${FINAL_FEATURES} --location tests)
    if [[ $? -ne 0 ]]; then
      echo "##vso[task.complete result=Failed;]Get test scripts fails."
      exit 1
    fi
    PR_CHECKERS=$(echo "${TEST_SCRIPTS}" | jq -c 'keys')
    if [[ $? -ne 0 ]]; then
      echo "##vso[task.complete result=Failed;]Get valid PR checkers fails."
      exit 1
    fi
    echo "##vso[task.setvariable variable=PR_CHECKERS;isOutput=true]$PR_CHECKERS"
    echo "##vso[task.setvariable variable=TEST_SCRIPTS;isOutput=true]$TEST_SCRIPTS"
  name: SetVariableTask
  continueOnError: false
  displayName: "Get impacted area"
Loading

0 comments on commit b086ae5

Please sign in to comment.