From 141bd2b70107db4798df4a980ff0bbf119c51f24 Mon Sep 17 00:00:00 2001
From: Arne Tarara
Date: Sun, 7 Jan 2024 08:02:19 +0100
Subject: [PATCH] Nvidia SMI (#645)

* Added NVIDIA SMI metrics reporter

* Turned off metric provider for Tests
---
 .../tests-eco-ci-energy-estimation.yaml       |  2 +-
 .github/workflows/tests-vm-main.yml           |  2 +-
 .github/workflows/tests-vm-pr.yml             |  2 +-
 config.yml.example                            |  3 +
 frontend/js/helpers/config.js.example         | 10 +++
 .../metric-provider-nvidia-smi-wrapper.sh     | 27 +++++++++
 .../energy/nvidia/smi/component/provider.py   | 57 +++++++++++++++++++
 7 files changed, 100 insertions(+), 3 deletions(-)
 create mode 100644 metric_providers/gpu/energy/nvidia/smi/component/metric-provider-nvidia-smi-wrapper.sh
 create mode 100644 metric_providers/gpu/energy/nvidia/smi/component/provider.py

diff --git a/.github/workflows/tests-eco-ci-energy-estimation.yaml b/.github/workflows/tests-eco-ci-energy-estimation.yaml
index fecf400f8..39640efab 100644
--- a/.github/workflows/tests-eco-ci-energy-estimation.yaml
+++ b/.github/workflows/tests-eco-ci-energy-estimation.yaml
@@ -30,7 +30,7 @@ jobs:
         name: 'Setup, Run, and Teardown Tests'
         uses: ./.github/actions/gmt-pytest
         with:
-          metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS'
+          metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS GPU --providers PsuEnergyAcSdiaMachineProvider'
           github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Eco CI Energy Estimation - Get Measurement
diff --git a/.github/workflows/tests-vm-main.yml b/.github/workflows/tests-vm-main.yml
index d335de15f..ddbbc0ff1 100644
--- a/.github/workflows/tests-vm-main.yml
+++ b/.github/workflows/tests-vm-main.yml
@@ -34,7 +34,7 @@ jobs:
         name: 'Setup, Run, and Teardown Tests'
         uses: ./.github/actions/gmt-pytest
         with:
-          metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS --providers PsuEnergyAcSdiaMachineProvider'
+          metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS GPU --providers PsuEnergyAcSdiaMachineProvider'
           github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Eco CI Energy Estimation - Get Measurement
diff --git a/.github/workflows/tests-vm-pr.yml b/.github/workflows/tests-vm-pr.yml
index 7ba6038f8..45f5461e0 100644
--- a/.github/workflows/tests-vm-pr.yml
+++ b/.github/workflows/tests-vm-pr.yml
@@ -25,7 +25,7 @@ jobs:
       - name: 'Setup, Run, and Teardown Tests'
         uses: ./.github/actions/gmt-pytest
         with:
-          metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS --providers PsuEnergyAcSdiaMachineProvider'
+          metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS GPU --providers PsuEnergyAcSdiaMachineProvider'
           github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Eco CI Energy Estimation - Get Measurement
diff --git a/config.yml.example b/config.yml.example
index d74f8478f..f01228e0f 100644
--- a/config.yml.example
+++ b/config.yml.example
@@ -96,6 +96,9 @@ measurement:
 #    psu.energy.ac.mcp.machine.provider.PsuEnergyAcMcpMachineProvider:
 #      resolution: 99
 #    psu.energy.ac.ipmi.machine.provider.PsuEnergyAcIpmiMachineProvider:
+#      resolution: 99
+  #--- GPU - Only enable these if you have GPUs with power measurement enabled in your machine
+#    gpu.energy.nvidia.smi.component.provider.GpuEnergyNvidiaSmiComponentProvider:
 #      resolution: 99
   #--- Sensors - these providers need the lm-sensors package installed
 #    lm_sensors.temperature.component.provider.LmSensorsTemperatureComponentProvider:
diff --git a/frontend/js/helpers/config.js.example b/frontend/js/helpers/config.js.example
index 13c0a169a..31896a2f6 100644
--- a/frontend/js/helpers/config.js.example
+++ b/frontend/js/helpers/config.js.example
@@ -201,6 +201,16 @@ METRIC_MAPPINGS = {
         'source': 'cgroup',
         'explanation': 'Network I/O. Details on docs.green-coding.berlin/docs/measuring/metric-providers/network-io-cgroup-container',
     },
+    'gpu_energy_nvidia_smi_component': {
+        'clean_name': 'GPU Energy',
+        'source': 'NVIDIA SMI',
+        'explanation': 'Derived NVIDIA SMI based GPU energy',
+    },
+    'gpu_power_nvidia_smi_component': {
+        'clean_name': 'GPU Power',
+        'source': 'NVIDIA SMI',
+        'explanation': 'NVIDIA SMI based GPU power',
+    },
     'cpu_energy_rapl_msr_component': {
         'clean_name': 'CPU Energy (Package)',
         'source': 'RAPL',
diff --git a/metric_providers/gpu/energy/nvidia/smi/component/metric-provider-nvidia-smi-wrapper.sh b/metric_providers/gpu/energy/nvidia/smi/component/metric-provider-nvidia-smi-wrapper.sh
new file mode 100644
index 000000000..945874eb8
--- /dev/null
+++ b/metric_providers/gpu/energy/nvidia/smi/component/metric-provider-nvidia-smi-wrapper.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Streams the GPU power draw reported by nvidia-smi, one sample per line,
+# prefixed with a wall-clock timestamp in microseconds.
+#
+# Usage: metric-provider-nvidia-smi-wrapper.sh -i SAMPLING_INTERVAL_MS
+
+i=''
+
+while getopts "i:" o; do
+    case "$o" in
+        i)
+            i=${OPTARG}
+            ;;
+    esac
+done
+
+# Each record needs its own `date` call: close() ends the finished pipe so the
+# next getline starts a new one instead of hitting EOF and reusing the first
+# timestamp. fflush() passes every sample on without waiting for a full buffer.
+nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits -lms "$i" | awk '{
+    cmd = "date +%s%6N"
+    cmd | getline timestamp
+    close(cmd)
+    print timestamp " " $0
+    fflush()
+}'
diff --git a/metric_providers/gpu/energy/nvidia/smi/component/provider.py b/metric_providers/gpu/energy/nvidia/smi/component/provider.py
new file mode 100644
index 000000000..1298711ed
--- /dev/null
+++ b/metric_providers/gpu/energy/nvidia/smi/component/provider.py
@@ -0,0 +1,57 @@
+import os
+
+from metric_providers.base import BaseMetricProvider
+
+class GpuEnergyNvidiaSmiComponentProvider(BaseMetricProvider):
+    """Reports GPU energy derived from the power draw sampled by nvidia-smi."""
+
+    def __init__(self, resolution, skip_check=False):
+        # NOTE(review): nvidia-smi prints power.draw as a decimal number of Watts;
+        # confirm that the base class can parse that with the int type used here.
+        super().__init__(
+            metric_name='gpu_energy_nvidia_smi_component',
+            metrics={'time': int, 'value': int},
+            resolution=resolution,
+            unit='mJ',
+            current_dir=os.path.dirname(os.path.abspath(__file__)),
+            metric_provider_executable='metric-provider-nvidia-smi-wrapper.sh',
+            skip_check=skip_check,
+        )
+
+    def read_metrics(self, run_id, containers=None):
+        """Return the recorded samples with ``value`` converted to energy.
+
+        Each power sample is multiplied by the time that passed since the
+        previous sample. The first sample has no predecessor, so the mean
+        interval of the run is used for it.
+
+        If ever in need to convert the database from Joules back to a power
+        format:
+
+            WITH times as (
+                SELECT id, value, detail_name, time, (time - LAG(time) OVER (ORDER BY detail_name ASC, time ASC)) AS diff, unit
+                FROM measurements
+                WHERE run_id = RUN_ID AND metric = 'gpu_energy_nvidia_smi_component'
+
+                ORDER BY detail_name ASC, time ASC)
+            SELECT *, value / (diff / 1000) as power FROM times;
+
+        One can see that the value only changes once per second.
+        """
+        df = super().read_metrics(run_id, containers)
+
+        if df.empty:
+            # Nothing was sampled; there is no interval to derive energy from.
+            return df
+
+        intervals = df['time'].diff()  # in microseconds
+        # Positional access: the first row is not guaranteed to carry label 0.
+        intervals.iloc[0] = intervals.mean()  # approximate first interval
+
+        # Power * microseconds / 1_000 gives the unit declared in __init__ (mJ)
+        # when the power is in Watts. A run with a single sample has no
+        # interval at all and is therefore reported as 0.
+        energy = df['value'] * intervals / 1_000
+        df['value'] = energy.fillna(0).astype(int)
+
+        return df