From 330c060c838181ceb342ae4be4cfacc18cf988b2 Mon Sep 17 00:00:00 2001 From: milan-zededa <83634241+milan-zededa@users.noreply.github.com> Date: Wed, 4 Oct 2023 13:56:06 +0200 Subject: [PATCH] Collect EVE info via console if ssh access is not working (#895) We use collect-info script to take a snapshot of the EVE state when tests fail and publish them alongside logs. Previously, we relied on the ssh access to produce and download EVE info tarball. However, this is not available when device is failing to onboard or if there are some networking issues. In that case, we can make (a rather desperate) attempt to get debug info over the serial console. Signed-off-by: Milan Lenco --- .github/actions/collect-info/action.yml | 16 +++---- .github/workflows/eden.yml | 13 ++---- pkg/controller/functions.go | 1 + shell-scripts/collect-info-console.sh | 58 +++++++++++++++++++++++++ shell-scripts/collect-info-ssh.sh | 46 ++++++++++++++++++++ 5 files changed, 114 insertions(+), 20 deletions(-) create mode 100755 shell-scripts/collect-info-console.sh create mode 100755 shell-scripts/collect-info-ssh.sh diff --git a/.github/actions/collect-info/action.yml b/.github/actions/collect-info/action.yml index 289525008..a8e6f3839 100644 --- a/.github/actions/collect-info/action.yml +++ b/.github/actions/collect-info/action.yml @@ -1,20 +1,14 @@ name: 'Collect and store debug info' -description: 'Collect debug info using EVE script executed via ssh and store downloaded tarball under the specified file name' +description: 'Collect debug info using EVE script executed via ssh or console and store downloaded tarball under the specified file name' runs: using: 'composite' steps: - name: Collect info run: | - # Give EVE 5 minutes at most to enable ssh access (if tests failed early). 
- for i in $(seq 60); do ./eden eve ssh && break || sleep 5; done - ./eden sdn fwd eth0 22 --\ - ssh -o StrictHostKeyChecking=no -p FWD_PORT -i ./dist/default-certs/id_rsa root@FWD_IP collect-info.sh &&\ - ./eden sdn fwd eth0 22 --\ - scp -o StrictHostKeyChecking=no -P FWD_PORT -i ./dist/default-certs/id_rsa root@FWD_IP:/persist/eve-info-* . &&\ - # upload-artifact complains about colon in the file name - # make sure to update upload step if changing name - mv eve-info-* eve-info.tar.gz ||\ - echo "failed to collect info" + # Do not pollute console logs which are collected by publish-logs action. + cp dist/default-eve.log dist/default-eve.log.backup || true + ./shell-scripts/collect-info-ssh.sh || ./shell-scripts/collect-info-console.sh 120 + cp dist/default-eve.log.backup dist/default-eve.log || true shell: bash working-directory: "./eden" diff --git a/.github/workflows/eden.yml b/.github/workflows/eden.yml index 4b4e83dc1..04defea55 100644 --- a/.github/workflows/eden.yml +++ b/.github/workflows/eden.yml @@ -60,15 +60,10 @@ jobs: - name: Collect info if: ${{ failure() }} run: | - # Give EVE 5 minutes at most to enable ssh access (if tests failed early). - for i in $(seq 60); do ./eden eve ssh && break || sleep 5; done - ./eden sdn fwd eth0 22 --\ - ssh -o StrictHostKeyChecking=no -p FWD_PORT -i ./dist/default-certs/id_rsa root@FWD_IP collect-info.sh &&\ - ./eden sdn fwd eth0 22 --\ - scp -o StrictHostKeyChecking=no -P FWD_PORT -i ./dist/default-certs/id_rsa root@FWD_IP:/persist/eve-info-* . &&\ - # upload-artifact complains about colon in the file name - mv eve-info-* eve-info.tar.gz ||\ - echo "failed to collect info" + # Do not pollute console logs which are collected by publish-logs action. 
+ cp dist/default-eve.log dist/default-eve.log.backup || true + ./shell-scripts/collect-info-ssh.sh || ./shell-scripts/collect-info-console.sh 120 + cp dist/default-eve.log.backup dist/default-eve.log || true - name: Collect logs if: ${{ always() }} run: | diff --git a/pkg/controller/functions.go b/pkg/controller/functions.go index 991d5f2ba..d528c0729 100644 --- a/pkg/controller/functions.go +++ b/pkg/controller/functions.go @@ -127,6 +127,7 @@ func (cloud *CloudCtx) OnBoardDev(node *device.Ctx) error { node.SetConfigItem("app.allow.vnc", "true") node.SetConfigItem("newlog.allow.fastupload", "true") node.SetConfigItem("timer.download.retry", "60") + node.SetConfigItem("debug.enable.console", "true") // TODO: allow to enable/disable: //node.SetConfigItem("network.fallback.any.eth", "disabled") log.Debugf("will apply devModel %s", node.GetDevModel()) diff --git a/shell-scripts/collect-info-console.sh b/shell-scripts/collect-info-console.sh new file mode 100755 index 000000000..8e8308d46 --- /dev/null +++ b/shell-scripts/collect-info-console.sh @@ -0,0 +1,58 @@ +#!/bin/sh + +# This script runs collect-info.sh on EVE VM and downloads produced tarball +# using only serial console. This is especially useful when networking +# on the virtualized EVE is not working and therefore collect-info-ssh.sh +# is unable to do the same via SSH tunnel. + +# Use output filename without colon, otherwise Github action "upload-artifact" complains. +OUTPUT="eve-info.tar.gz" + +# 20 seconds should be enough for collect-info.sh to prepare tarball with debug info +# if run locally on a solid machine. However, on Github runners, it can take up to 2 minutes +# to complete (which is what we set from Github actions). +WAIT_TIME="${1:-20}" + +# Switch to debug container where collect-info.sh is installed. 
+for i in $(seq 3); do
+  {
+    echo "eve verbose off"; echo "eve enter debug"; sleep 3;
+    echo "which collect-info.sh"; sleep 3
+  } | telnet localhost 7777 | tee telnet.stdout
+  grep -q "/usr/bin/collect-info.sh" telnet.stdout && break
+  sleep 60
+done
+
+for i in $(seq 3); do
+  {
+    echo "rm -f /persist/eve-info*"; echo "/usr/bin/collect-info.sh";
+    sleep $((WAIT_TIME+60*(i-1)))
+  } | telnet localhost 7777 | tee telnet.stdout
+  TGZNAME="$(sed -n "s/EVE info is collected '\(.*\)'/\1/p" telnet.stdout)"
+  [ -n "${TGZNAME}" ] && break
+done
+
+if [ -z "${TGZNAME}" ]; then
+  echo "Failed to run collect-info.sh script"
+  exit 1
+fi
+
+for i in $(seq 3); do
+  {
+    # Filename does not fit on one console line, we have to use asterisk.
+    echo "echo \>\>\>\$(base64 -w 0 /persist/eve-info*)\<\<\<";
+    # This is fairly quick even on Github runners - around 10 seconds, but depends
+    # on the tarball size. Only the text between >>> and <<< is passed to the decoder.
+    sleep $((20+60*(i-1)))
+  } | telnet localhost 7777 | sed -n "s/.*>>>\(.*\)<<<.*/\1/p" | base64 -d > "${OUTPUT}"
+  [ -s "${OUTPUT}" ] && break
+  echo "Failed to receive eve-info tarball, retrying..."
+done
+
+if [ ! -s "${OUTPUT}" ]; then
+  echo "Failed to receive eve-info"
+  exit 1
+fi
+
+FILESIZE="$(stat -c%s "$OUTPUT")"
+echo "Received ${OUTPUT} with size ${FILESIZE}"
\ No newline at end of file
diff --git a/shell-scripts/collect-info-ssh.sh b/shell-scripts/collect-info-ssh.sh
new file mode 100755
index 000000000..9e7730235
--- /dev/null
+++ b/shell-scripts/collect-info-ssh.sh
@@ -0,0 +1,46 @@
+#!/bin/sh
+
+# Use output filename without colon, otherwise Github action "upload-artifact" complains.
+OUTPUT="eve-info.tar.gz"
+
+ssh() { # Run the given command on EVE as root, wrapped in "eden sdn fwd" on eth0 port 22.
+  ./eden sdn fwd eth0 22 --\
+    ssh -o StrictHostKeyChecking=no -p FWD_PORT -i ./dist/default-certs/id_rsa root@FWD_IP "$@"
+}
+
+scp() { # Copy remote path $1 from EVE to local path $2, wrapped in "eden sdn fwd" as well.
+  ./eden sdn fwd eth0 22 --\
+    scp -o StrictHostKeyChecking=no -P FWD_PORT -i ./dist/default-certs/id_rsa "root@FWD_IP:$1" "$2"
+}
+
+if ./eden eve status | grep -q "no onboarded EVE"; then
+  echo "Cannot get eve-info via SSH from non-onboarded EVE VM"
+  exit 1
+fi
+
+# Give EVE 5 minutes at most to enable ssh access.
+# This delay is typically needed if tests failed early.
+for i in $(seq 60); do
+  ./eden eve ssh : && break || sleep 5
+done
+
+ssh collect-info.sh > ssh.stdout; SSH_RC=$?; cat ssh.stdout # piping into tee would report tee's status
+if [ "${SSH_RC}" -ne 0 ]; then
+  echo "Failed to run collect-info.sh script"
+  exit 1
+fi
+
+TGZNAME="$(sed -n "s/EVE info is collected '\(.*\)'/\1/p" ssh.stdout)"
+if [ -z "${TGZNAME}" ]; then
+  echo "Failed to parse eve-info tarball filename"
+  exit 1
+fi
+
+scp "${TGZNAME}" "${OUTPUT}"
+if [ $? -ne 0 ]; then
+  echo "Failed to receive eve-info"
+  exit 1
+fi
+
+FILESIZE="$(stat -c%s "$OUTPUT")"
+echo "Received ${OUTPUT} with size ${FILESIZE}"
\ No newline at end of file