Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

check-playback: add check for FW performance #959

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions case-lib/hijack.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ trap 'func_exit_handler $?' EXIT
function func_exit_handler()
{
local exit_status=${1:-0}
local loggerBin wcLog; loggerBin=sof-logger
local mtraceBin; mtraceBin=mtrace-reader.py

# call trace
if [ "$exit_status" -ne 0 ] ; then
Expand All @@ -24,7 +26,7 @@ function func_exit_handler()
fi

# when sof logger collect is open
if [ "X$SOF_LOG_COLLECT" == "X1" ]; then
if [ "X$SOF_LOG_COLLECT" == "X1" ] && ! is_ipc4_zephyr; then
# when error occurs, exit and catch etrace log
[[ $exit_status -eq 1 ]] && {
func_lib_start_log_collect 1
Expand Down Expand Up @@ -54,7 +56,6 @@ function func_exit_handler()
# shellcheck disable=SC2154
logfile="$logfile"

local loggerBin wcLog; loggerBin=$(basename "$SOFLOGGER")
# We need this to avoid the confusion of a "Terminated" message
# without context.
dlogi "pkill -TERM $loggerBin"
Expand Down Expand Up @@ -100,6 +101,14 @@ function func_exit_handler()

fi

if is_ipc4_zephyr; then
dlogi "pkill -TERM -f $mtraceBin"
sudo pkill -TERM -f "$mtraceBin" || {
dloge "mtrace-reader.py was already dead"
exit_status=1
}
fi

stop_test || exit_status=1

if [ "$ENABLE_STORAGE_CHECKS" = '1' ]; then
Expand Down
128 changes: 88 additions & 40 deletions case-lib/lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,51 @@ find_ldc_file()
printf '%s' "$ldcFile"
}

func_mtrace_collect()
{
local clogfile=$LOG_ROOT/mtrace.txt

if [ -z "$MTRACE" ]; then
MTRACE=$(command -v mtrace-reader.py) || {
dlogw 'No mtrace-reader.py found in PATH'
return 1
}
fi

local mtraceCmd="$MTRACE"
dlogi "Starting ${mtraceCmd[*]}"
# Cleaned up by func_exit_handler() in hijack.sh
# shellcheck disable=SC2024
sudo "${mtraceCmd[@]}" >& "$clogfile" &
}

func_sof_logger_collect()
{
logfile=$1
logopt=$2
ldcFile=$(find_ldc_file) || return $?

if [ -z "$SOFLOGGER" ]; then
SOFLOGGER=$(command -v sof-logger) || {
dlogw 'No sof-logger found in PATH'
return 1
}
fi

test -x "$SOFLOGGER" || {
dlogw "$SOFLOGGER not found or not executable"
return 2
}

# The logger does not like empty '' arguments and $logopt can be
# shellcheck disable=SC2206
local loggerCmd=("$SOFLOGGER" $logopt -l "$ldcFile")
dlogi "Starting ${loggerCmd[*]}"
# Cleaned up by func_exit_handler() in hijack.sh
# shellcheck disable=SC2024
sudo "${loggerCmd[@]}" > "$logfile" &
}

SOF_LOG_COLLECT=0
# This function starts a logger in the background using '&'
#
Expand All @@ -315,45 +360,32 @@ SOF_LOG_COLLECT=0
func_lib_start_log_collect()
{
local is_etrace=${1:-0} ldcFile

ldcFile=$(find_ldc_file) || return $?

local logopt="-t"

if [ -z "$SOFLOGGER" ]; then
SOFLOGGER=$(command -v sof-logger) || {
dlogw 'No sof-logger found in PATH'
return 1
}
fi

test -x "$SOFLOGGER" || {
dlogw "$SOFLOGGER not found or not executable"
return 2
}

if [ "X$is_etrace" == "X0" ];then
logfile=$LOG_ROOT/slogger.txt
else
logfile=$LOG_ROOT/etrace.txt
logopt=""
fi
local log_file log_opt

if func_hijack_setup_sudo_level ;then
# shellcheck disable=SC2034 # external script will use it
SOF_LOG_COLLECT=1
else
>&2 dlogw "without sudo permission to run $SOFLOGGER command"
>&2 dlogw "without sudo permission to run logging command"
return 3
fi

# The logger does not like empty '' arguments and $logopt can be
# shellcheck disable=SC2206
local loggerCmd=("$SOFLOGGER" $logopt -l "$ldcFile")
dlogi "Starting ${loggerCmd[*]}"
# Cleaned up by func_exit_handler() in hijack.sh
# shellcheck disable=SC2024
sudo "${loggerCmd[@]}" > "$logfile" &
if [ "X$is_etrace" == "X0" ];then
if is_ipc4_zephyr; then
func_mtrace_collect
else
log_file=$LOG_ROOT/slogger.txt
log_opt="-t"
func_sof_logger_collect "$log_file" "$log_opt"
fi
else
is_ipc4_zephyr || {
log_file=$LOG_ROOT/etrace.txt
log_opt=""
func_sof_logger_collect "$log_file" "$log_opt"
}
fi

}

check_error_in_file()
Expand Down Expand Up @@ -667,6 +699,26 @@ ipc4_used()
return 0
}

is_ipc4_zephyr(){
# check if the ipc_type first
ipc4_used || return 1

local firmware_path znum
local firmware_name=dsp_basefw.bin
fw_mod_para=/sys/module/snd_sof_pci/parameters/fw_path

if [ ! -s "$fw_mod_para" ]; then
firmware_path=$(cat $fw_mod_para)
else
# TODO: let the kernel driver expose the FW path
# and get the FW path by grepping journalctl.
return 1
fi

znum=$(strings "/lib/firmware/$firmware_path/$firmware_name" | grep -c -i zephyr)
test "$znum" -gt 10
}

logger_disabled()
{
local ldcFile
Expand All @@ -689,16 +741,12 @@ logger_disabled()
return 0
fi

ipc4_used && {
# TODO:
# Need to remove disabling sof-logger
# after sof-logger support for IPC4 has been provided in the future
dlogi 'Currenly sof-logger is not supported when running IPC4 mode'
dlogi 'SOF logs collection is globally disabled because DUT is running IPC4 mode'
if ipc4_used && ! is_ipc4_zephyr; then
# mtrace and sof-logger are not supported for closed source IPC4 Firmware
dlogi 'IPC4 FW logging only support with SOF Zephyr build'
dlogi 'SOF logs collection is globally disabled.'
return 0
}
dlogi 'DUT is running IPC3 mode'

fi
return 1
}

Expand Down
9 changes: 9 additions & 0 deletions test-case/check-playback.sh
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,15 @@ do
func_lib_lsof_error_dump "$snd"
die "aplay on PCM $dev failed at $i/$loop_cnt."
}

tplg_basename=$(basename $tplg)
platform=$(sof-dump-status.py -p)
is_ipc4_zephyr && {
data_file=$LOG_ROOT/mtrace.txt
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK this new test is merely scanning logs so it should be hardcoded to neither IPC4 nor Zephyr nor mtrace.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@marc-hb Well that's complicated. The log output structure is specific to Zephyr (it uses formatting of the Zephyr logging subsystem). IPC4 should not be a dependency, but we do need separate code to handle sof-logger output.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One big TODO that needs to be addresses is limiting the analysis to also cover logs that happened during the test run. Here full mtrace.txt is analyzed, but it is not guaranteed the DSP went to D3 between the test iterations. So some kind of capture of the timestamp of last line of log line before test starts, and pruning the log to only cover the new entries added during test execution, is needed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems complicated...

  1. Can the analysis be ran last, after the log contributors have finished?
  2. Can the analysis be ran as a "follow," so it gathers updates automatically?
  3. Can the logs be backed up and purged after the initial run?

It's possible 3 may not work, as it may still present the same challenges.

test_reference_key="${platform}-${tplg_basename}-ipc4-zephyr-check-playback-${dev}"
TOPDIR="$(dirname "${BASH_SOURCE[0]}")"/..
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move TOPDIR at the top of the file (and then use it in source ... lib.sh). Copy what's already been done in many other scripts in this directory.

$TOPDIR/tools/sof-ll-timer-check.py ${data_file} $test_reference_key $TOPDIR/tools/sof-ll-timer-check-db.json
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now it's just in check-playback.sh but I expect this to be found in multiple tests in the future. So please some new check_firmware_load() (pick the name) shell function / indirection that can be quickly deployed across many tests, quickly turned on/off test-suite wide, easily switched to a less crude method that does not parse the logs, etc.

}
done
done
done
Expand Down
11 changes: 11 additions & 0 deletions tools/sof-ll-timer-check-db.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[
{
"test-key": "tgl-sof-tgl-nocodec.tplg-ipc4-zephyr-check-playback-hw:0,0" ,
"ll-timer-avg": 2589
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"ll-timer-avg": 2589
"expected-ll-timer-avg": 2589
Suggested change
"ll-timer-avg": 2589
"maximum-ll-timer-avg": 2589
Suggested change
"ll-timer-avg": 2589
"maximum-ll-timer-avg-before-error-margin": 2589

You get the idea.

},
{
"test-key": "tgl-sof-tgl-nocodec.tplg-ipc4-zephyr-check-playback-hw:0,1" ,
"ll-timer-avg": 2589
}
]

59 changes: 59 additions & 0 deletions tools/sof-ll-timer-check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/usr/bin/env python3
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Most of what this script does is parsing a log file but there's neither "parsing" nor "log" in the name. This is a test code so I don't think this is an "implementation detail" that should be hidden.


'''Script to analyze performance data in SOF FW log output'''
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


import sys
import re
import json
from statistics import median

# allowed error margin before error is raised for
# lower observed performance (1.05 -> 5% difference
# required to raise error)
AVG_ERROR_MARGIN = 1.05
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels overkill / YAGNI... why not just bump the thresholds 5%? There are only two of them.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@marc-hb I added this on purpose. Basicly anyone setting baseline values is going to run a test and copy values over to the database. Probably we add a option to the test script it self that if you pass "--reference-capture", script will run the test, measure performance, but instead of comparing against expected value, just write the measured values out in jsof/init format, so they can be incorporated in the database (after of course reviewed that values are a sane new baseline).

So to make above possible, it's better the margin is stored elsewhere.


max_vals = []
avg_vals = []
overruns = 0

f = open(sys.argv[1])

for line in f:
m = re.search('.*ll timer avg ([0-9]*), max ([0-9]*), overruns ([0-9]*)', line)
if m:
avg_vals.append(int(m.group(1)))
max_vals.append(int(m.group(2)))
overruns += int(m.group(3))

median_avg_vals = median(avg_vals)
print("Measurements:\t\t%d" % len(avg_vals))
print("Median avg reported:\t%d" % median_avg_vals)
print("Median max reported:\t%d" % median(max_vals))
print("Highest max reported:\t%d" % max(max_vals))
Copy link
Collaborator

@marc-hb marc-hb Sep 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

import logging
# Can be invoked only once and only before any actual logging
logging.basicConfig(level=logging.DEBUG)

...

logging.info(""Measurements:\t\t%d" % len(avg_vals)")

and that's it, nothing else needed. Fancy stuff can be more easily added later if/when needed. https://docs.python.org/3/howto/logging.html


if overruns:
print("ERROR: %s overruns detected" % overruns, file=sys.stderr)
sys.exit(-1)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
sys.exit(-1)
sys.exit("ERROR: %s overruns detected" % overruns")

prints the message on stderr and returns 1.


if len(sys.argv) < 4:
print("No reference data for key '%s', unable to check performance against reference")
sys.exit(0)

median_avg_ref = None
dbfile = open(sys.argv[3])
ref_key = sys.argv[2]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

argparse requires a tiny bit more effort than logging but it's really worth it. Don't procrastinate. git grep argparse will find multiple examples in this repo https://docs.python.org/3/howto/argparse.html

ref_data_all = json.load(dbfile)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you considered a .init file?
https://docs.python.org/3/library/configparser.html

It's less powerful and less flexible but more user-friendly and more programmer friendly.

Long story short: do you expect a hierarchy of values?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That should be considered, yeah. I frankly don't yet know whether even whole concept is pragmatic/feasible, so I just wanted to get some minimum-working-set and json was good for this.


for ref in ref_data_all:
if ref["test-key"] == ref_key:
median_avg_ref = ref["ll-timer-avg"]
break

if not median_avg_ref:
print("No reference data for key '%s', unable to check performance against reference" % ref_key)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You would probably need that code with a simpler .ini file, it would likely just fail with a meaning full stack trace and error message.

sys.exit(0)

median_avg_ref_w_margin = median_avg_ref * AVG_ERROR_MARGIN
if median_avg_vals > median_avg_ref_w_margin:
print("ERROR: ll-timer-avg median %d over threshold %d (%d without margin)" % (median_avg_vals, median_avg_ref_w_margin, median_avg_ref), file=sys.stderr)
sys.exit(-1)