Skip to content

Commit

Permalink
Nvidia SMI (#645)
Browse files Browse the repository at this point in the history
* Added NVIDIA SMI metrics reporter

* Turned off metric provider for Tests
  • Loading branch information
ArneTR authored Jan 7, 2024
1 parent 6098557 commit 141bd2b
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests-eco-ci-energy-estimation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ jobs:
name: 'Setup, Run, and Teardown Tests'
uses: ./.github/actions/gmt-pytest
with:
metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS'
metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS GPU --providers PsuEnergyAcSdiaMachineProvider'
github-token: ${{ secrets.GITHUB_TOKEN }}

- name: Eco CI Energy Estimation - Get Measurement
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/tests-vm-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
name: 'Setup, Run, and Teardown Tests'
uses: ./.github/actions/gmt-pytest
with:
metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS --providers PsuEnergyAcSdiaMachineProvider'
metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS GPU --providers PsuEnergyAcSdiaMachineProvider'
github-token: ${{ secrets.GITHUB_TOKEN }}

- name: Eco CI Energy Estimation - Get Measurement
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/tests-vm-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
- name: 'Setup, Run, and Teardown Tests'
uses: ./.github/actions/gmt-pytest
with:
metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS --providers PsuEnergyAcSdiaMachineProvider'
metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS GPU --providers PsuEnergyAcSdiaMachineProvider'
github-token: ${{ secrets.GITHUB_TOKEN }}

- name: Eco CI Energy Estimation - Get Measurement
Expand Down
3 changes: 3 additions & 0 deletions config.yml.example
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ measurement:
# psu.energy.ac.mcp.machine.provider.PsuEnergyAcMcpMachineProvider:
# resolution: 99
# psu.energy.ac.ipmi.machine.provider.PsuEnergyAcIpmiMachineProvider:
# resolution: 99
#--- GPU - Only enable these if you have GPUs with power measurement enabled in your machine
# gpu.energy.nvidia.smi.component.provider.GpuEnergyNvidiaSmiComponentProvider:
# resolution: 99
#--- Sensors - these providers need the lm-sensors package installed
# lm_sensors.temperature.component.provider.LmSensorsTemperatureComponentProvider:
Expand Down
10 changes: 10 additions & 0 deletions frontend/js/helpers/config.js.example
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,16 @@ METRIC_MAPPINGS = {
'source': 'cgroup',
'explanation': 'Network I/O. Details on docs.green-coding.berlin/docs/measuring/metric-providers/network-io-cgroup-container',
},
'gpu_energy_nvidia_smi_component': {
'clean_name': 'GPU Energy',
'source': 'NVIDA SMI',
'explanation': 'Derived NVIDIA SMI based GPU energy',
},
'gpu_power_nvidia_smi_component': {
'clean_name': 'GPU Power',
'source': 'NVIDA SMI',
'explanation': 'NVIDIA SMI based GPU power',
},
'cpu_energy_rapl_msr_component': {
'clean_name': 'CPU Energy (Package)',
'source': 'RAPL',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
#
# Streams GPU power-draw samples from nvidia-smi, one sample per line, each
# prefixed with the wall-clock time at which the sample was received:
#
#     <timestamp in microseconds since the epoch> <power.draw>
#
# Options:
#   -i <value>   handed to nvidia-smi's -lms option (loop interval, milliseconds)

i=''

while getopts "i:" o; do
    case "$o" in
        i)
            i=${OPTARG}
            ;;
    esac
done

# Notes on the awk program below:
#  * close() after every read is required. awk keeps a command pipe open
#    between getline calls, so without it `date` runs only once and every
#    following sample is printed with the same stale timestamp.
#  * %6N (GNU date) truncates the nanosecond field to microseconds, which is
#    the unit the provider's read_metrics assumes for time differences.
#  * fflush() pushes each sample out immediately; when stdout is a pipe or a
#    file awk would otherwise hold samples back in a block buffer.
nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits -lms "$i" | awk '
    BEGIN { stamp_cmd = "date +%s%6N" }
    {
        stamp_cmd | getline timestamp
        close(stamp_cmd)
        print timestamp " " $0
        fflush()
    }'
46 changes: 46 additions & 0 deletions metric_providers/gpu/energy/nvidia/smi/component/provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os

from metric_providers.base import BaseMetricProvider

class GpuEnergyNvidiaSmiComponentProvider(BaseMetricProvider):
    """Metric provider that reports GPU energy derived from NVIDIA SMI power readings.

    The base provider is configured to run ``metric-provider-nvidia-smi-wrapper.sh``
    from this file's directory. ``read_metrics`` then converts every power
    sample into an energy amount by weighting it with the time elapsed since
    the previous sample.
    """

    def __init__(self, resolution, skip_check=False):
        """Configure the base provider for the ``gpu_energy_nvidia_smi_component`` metric.

        Args:
            resolution: Passed through unchanged to the base provider.
            skip_check: Passed through unchanged to the base provider.
        """
        super().__init__(
            metric_name='gpu_energy_nvidia_smi_component',
            metrics={'time': int, 'value': int},
            resolution=resolution,
            unit='mJ',
            current_dir=os.path.dirname(os.path.abspath(__file__)),
            metric_provider_executable='metric-provider-nvidia-smi-wrapper.sh',
            skip_check=skip_check,
        )

    def read_metrics(self, run_id, containers=None):
        """Read the recorded samples and convert their ``value`` column to energy.

        Args:
            run_id: Passed through to the base provider's ``read_metrics``.
            containers: Passed through to the base provider's ``read_metrics``.

        Returns:
            The frame returned by the base provider, with ``value`` replaced by
            ``value * elapsed_time / 1_000`` truncated to an integer.
        """
        df = super().read_metrics(run_id, containers)

        # Conversion to Joules
        # If ever in need to convert the database from Joules back to a power format:
        #
        #   WITH times as (
        #       SELECT id, value, detail_name, time, (time - LAG(time) OVER (ORDER BY detail_name ASC, time ASC)) AS diff, unit
        #       FROM measurements
        #       WHERE run_id = RUN_ID AND metric = 'gpu_energy_nvidia_smi_component'
        #       ORDER BY detail_name ASC, time ASC)
        #   SELECT *, value / (diff / 1000) as power FROM times;
        #
        # One can see that the value only changes once per second

        intervals = df['time'].diff()  # in microseconds

        # The first sample has no predecessor, so approximate its interval with
        # the mean of the others. Positional access is deliberate: label-based
        # `intervals[0]` would append a new entry instead whenever the frame's
        # index does not contain the label 0. The length guard keeps an empty
        # frame from raising.
        if len(intervals) > 0:
            intervals.iloc[0] = intervals.mean()

        # Vectorized equivalent of multiplying each row's value by its interval.
        # A single-sample frame has no interval at all (NaN) and ends up as 0.
        energy = df['value'] * intervals / 1_000
        df['value'] = energy.fillna(0).astype(int)

        return df

0 comments on commit 141bd2b

Please sign in to comment.