Skip to content

Commit

Permalink
Temperature Baseline cooldown in client.py (#643)
Browse files Browse the repository at this point in the history
* Temperature Baseline cooldown in client.py

* Changed cooldown calculations [skip ci]

* using -n flag [skip ci]

* using -n flag also in temperature [skip ci]

* Other checking mechanism for lm_sensors temp [skip ci]
  • Loading branch information
ArneTR authored Jan 5, 2024
1 parent 898482b commit bb63530
Show file tree
Hide file tree
Showing 8 changed files with 92 additions and 35 deletions.
3 changes: 2 additions & 1 deletion api/api_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def get_machine_list():
) SELECT
m.id, m.description, m.available,
m.status_code, m.updated_at, m.jobs_processing,
m.gmt_hash, m.gmt_timestamp, m.cooldown_time_after_job,
m.gmt_hash, m.gmt_timestamp,
m.base_temperature, m.current_temperature, m.cooldown_time_after_job,
(SELECT COUNT(id) FROM jobs as j WHERE j.machine_id = m.id AND j.state = 'WAITING') as job_amount,
(SELECT avg_duration FROM timings WHERE timings.machine_id = m.id )::int as avg_duration_seconds
Expand Down
5 changes: 3 additions & 2 deletions config.yml.example
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ cluster:
metrics_url: __METRICS_URL__
client:
sleep_time_no_job: 300
cooldown_time_after_job: 300
jobs_processing: "random"
time_between_control_workload_validations: 21600
send_control_workload_status_mail: False
Expand All @@ -53,7 +52,9 @@ machine:
description: "Development machine for testing"
# Takes a file path to log all the errors to it. This is disabled if False
error_log_file: False

base_temperature_value: False
base_temperature_chip: False
base_temperature_feature: False

measurement:
idle-time-start: 10
Expand Down
2 changes: 2 additions & 0 deletions docker/structure.sql
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ CREATE TABLE machines (
status_code text,
jobs_processing text,
cooldown_time_after_job integer,
base_temperature integer,
current_temperature integer,
gmt_hash text,
gmt_timestamp timestamp with time zone,
created_at timestamp with time zone DEFAULT now(),
Expand Down
11 changes: 7 additions & 4 deletions frontend/js/status.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ $(document).ready(function () {
case 'cleanup_start': return `${el} <span data-inverted data-tooltip="Cleanup after job has started"><i class="ui question circle icon fluid"></i></div>`;
case 'cleanup_end': return `${el} <span data-inverted data-tooltip="Cleanup after job has finished"><i class="ui question circle icon fluid"></i></div>`;
case 'measurement_control_start': return `${el} <span data-inverted data-tooltip="Periodic Measurement Control job has started"><i class="ui question circle icon fluid"></i></div>`;
case 'cooldown': return `${el} <span data-inverted data-tooltip="Machine is currently cooling down to base temperature"><i class="ui question circle icon fluid"></i></div>`;
case 'measurement_control_error': return `${el} <span data-inverted data-tooltip="Last periodic Measurement Control job has failed"><i class="ui question circle icon fluid"></i></div>`;
case 'measurement_control_end': return `${el} <span data-inverted data-tooltip="Periodic Measurement Control job has finished"><i class="ui question circle icon fluid"></i></div>`;
case undefined: // fallthrough
Expand All @@ -49,12 +50,14 @@ $(document).ready(function () {
return `<a href="https://github.com/green-coding-berlin/green-metrics-tool/commit/${el}">${`${el.substr(0,3)}...${el.substr(-3,3)}`}</a> (${dateToYMD(new Date(row[7]), true)})`;

}},
{ data: 8, title: 'Cooldown after Job', render: function(el) {
{ data: 8, title: 'Base temp (°)'},
{ data: 9, title: 'Current temp (°)', render: (el) => el == null ? '-' : el},
{ data: 10, title: 'Cooldown time', render: function(el) {
return (el == null) ? 'awaiting info': `${Math.round(el/60)} Minutes`;
}},
{ data: 9, title: 'Waiting Jobs'},
{ data: 10, title: 'Estimated waiting time', render: function(el, type, row) {
return (row[8] == null || row[10] == null) ? 'awaiting info' : `${Math.round(( (row[8]+row[10]) * row[9]) / 60)} Minutes`
{ data: 11, title: 'Waiting Jobs'},
{ data: 12, title: 'Estimated waiting time', render: function(el, type, row) {
return (row[10] == null || row[12] == null) ? 'awaiting info' : `${Math.round(( (row[10]+row[12]) * row[11]) / 60)} Minutes`
}},
],
deferRender: true,
Expand Down
9 changes: 8 additions & 1 deletion metric_providers/lm_sensors/source.c
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ static void output_value(int value, char *container_id) {

int main(int argc, char *argv[]) {
int c, err;
int measurement_amount = -1;
const char *config_file_name = NULL;

// These are the lists that we pass in through the command line
Expand All @@ -202,7 +203,7 @@ int main(int argc, char *argv[]) {
setlocale(LC_CTYPE, "");

while (1) {
c = getopt_long(argc, argv, "c:f:hts:i:", long_opts, NULL);
c = getopt_long(argc, argv, "c:f:hts:i:n:", long_opts, NULL);
if (c == EOF) break;
switch (c) {
case ':':
Expand Down Expand Up @@ -236,6 +237,9 @@ int main(int argc, char *argv[]) {
case 'i':
msleep_time = atoi(optarg);
break;
case 'n':
measurement_amount = atoi(optarg);
break;
default:
exit(1);
}
Expand Down Expand Up @@ -342,6 +346,9 @@ int main(int argc, char *argv[]) {
g_string_free(chip_feature_str, 1);
free(tmp_label);
}

if (measurement_amount != -1) measurement_amount--; // only decrement if switch was given to not overflow.
if (!measurement_amount) break;
usleep(msleep_time * 1000);
}
g_list_free_full(chip_feature_output, free);
Expand Down
2 changes: 2 additions & 0 deletions migrations/2024_01_05_base_temperature.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Migration: add two nullable integer columns to "machines" for the
-- machine's configured base temperature and its current temperature.
ALTER TABLE "machines" ADD COLUMN "base_temperature" integer;
ALTER TABLE "machines" ADD COLUMN "current_temperature" integer;
68 changes: 41 additions & 27 deletions tools/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@
from lib.db import DB
from lib.repo_info import get_repo_info
from tools import validate
from tools.temperature import get_temperature
from lib import email_helpers

# We currently have this dynamically as it will probably change quite a bit
STATUS_LIST = ['job_no', 'job_start', 'job_error', 'job_end', 'cleanup_start', 'cleanup_stop', 'measurement_control_start', 'measurement_control_end', 'measurement_control_error']
STATUS_LIST = ['cooldown', 'job_no', 'job_start', 'job_error', 'job_end', 'cleanup_start', 'cleanup_end', 'measurement_control_start', 'measurement_control_end', 'measurement_control_error']
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

def set_status(status_code, data=None, run_id=None):
def set_status(status_code, cur_temp, cooldown_time_after_job, data=None, run_id=None):
# pylint: disable=redefined-outer-name
config = GlobalConfig().config
client = config['cluster']['client']
Expand All @@ -37,77 +38,90 @@ def set_status(status_code, data=None, run_id=None):

query = """
UPDATE machines
SET status_code=%s, cooldown_time_after_job=%s, jobs_processing=%s, gmt_hash=%s, gmt_timestamp=%s
SET status_code=%s, cooldown_time_after_job=%s, current_temperature=%s, base_temperature=%s, jobs_processing=%s, gmt_hash=%s, gmt_timestamp=%s
WHERE id = %s
"""

gmt_hash, gmt_timestamp = get_repo_info(CURRENT_DIR)

params = (status_code, client['cooldown_time_after_job'], client['jobs_processing'], gmt_hash, gmt_timestamp, config['machine']['id'])
params = (status_code, cooldown_time_after_job, cur_temp, config['machine']['base_temperature_value'], client['jobs_processing'], gmt_hash, gmt_timestamp, config['machine']['id'])
DB().query(query=query, params=params)

def do_cleanup():
set_status('cleanup_start')
def do_cleanup(cur_temp, cooldown_time_after_job):
set_status('cleanup_start', cur_temp, cooldown_time_after_job)

result = subprocess.run(['sudo',
os.path.join(os.path.dirname(os.path.abspath(__file__)),'cluster/cleanup.sh')],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True,)

set_status('cleanup_stop', f"stdout: {result.stdout}, stderr: {result.stderr}")
set_status('cleanup_end', cur_temp, cooldown_time_after_job, data=f"stdout: {result.stdout}, stderr: {result.stderr}")


# pylint: disable=broad-except
if __name__ == '__main__':
client_main = GlobalConfig().config['cluster']['client']
config_main = GlobalConfig().config
client_main = config_main['cluster']['client']
cwl = client_main['control_workload']

first_start = True
cooldown_time = 0
last_cooldown_time = 0

while True:
job = Job.get_job('run')

if first_start or validate.is_validation_needed(client_main['time_between_control_workload_validations']):
set_status('measurement_control_start')
current_temperature = get_temperature(
GlobalConfig().config['machine']['base_temperature_chip'],
GlobalConfig().config['machine']['base_temperature_feature']
)

if current_temperature > config_main['machine']['base_temperature_value']:
print(f"Machine is still too hot: {current_temperature}°. Sleeping for 1 minute")
set_status('cooldown', current_temperature, last_cooldown_time)
cooldown_time += 60
time.sleep(60)
continue

print('Machine is cool enough. Continuing')
last_cooldown_time = cooldown_time
cooldown_time = 0

if validate.is_validation_needed(client_main['time_between_control_workload_validations']):
set_status('measurement_control_start', current_temperature, last_cooldown_time)
validate.run_workload(cwl['name'], cwl['uri'], cwl['filename'], cwl['branch'])
set_status('measurement_control_end')
set_status('measurement_control_end', current_temperature, last_cooldown_time)

stddev_data = validate.get_workload_stddev(cwl['uri'], cwl['filename'], cwl['branch'], GlobalConfig().config['machine']['id'], cwl['comparison_window'], cwl['phase'], cwl['metrics'])
stddev_data = validate.get_workload_stddev(cwl['uri'], cwl['filename'], cwl['branch'], config_main['machine']['id'], cwl['comparison_window'], cwl['phase'], cwl['metrics'])
print('get_workload_stddev returned: ', stddev_data)

try:
message = validate.validate_workload_stddev(stddev_data, cwl['threshold'])
if GlobalConfig().config['admin']['no_emails'] is False and client_main['send_control_workload_status_mail']:
if config_main['admin']['no_emails'] is False and client_main['send_control_workload_status_mail']:
email_helpers.send_admin_email(f"Machine is operating normally. All STDDEV below {cwl['threshold'] * 100} %", "\n".join(message))
except Exception as exception:
validate.handle_validate_exception(exception)
set_status('measurement_control_error')
set_status('measurement_control_error', current_temperature, last_cooldown_time)
# The process will now go to sleep for 'time_between_control_workload_validations'.
# That is exactly the interval after which the next validation is due, so it will loop
# endlessly in validation until manually handled, which is what we want.
time.sleep(client_main['time_between_control_workload_validations'])
finally:
time.sleep(client_main['cooldown_time_after_job'])
do_cleanup()
do_cleanup(current_temperature, last_cooldown_time)

elif job:
set_status('job_start', '', job._run_id)
set_status('job_start', current_temperature, last_cooldown_time, run_id=job._run_id)
try:
job.process(docker_prune=True)
set_status('job_end', '', job._run_id)
set_status('job_end', current_temperature, last_cooldown_time, run_id=job._run_id)
except Exception as exc:
set_status('job_error', str(exc), job._run_id)
set_status('job_error', current_temperature, last_cooldown_time, data=str(exc), run_id=job._run_id)
handle_job_exception(exc, job)
finally:
time.sleep(client_main['cooldown_time_after_job'])
do_cleanup()
do_cleanup(current_temperature, last_cooldown_time)

else:
do_cleanup()
set_status('job_no')
do_cleanup(current_temperature, last_cooldown_time)
set_status('job_no', current_temperature, last_cooldown_time)
if client_main['shutdown_on_job_no'] is True:
subprocess.check_output(['sudo', 'shutdown'])
time.sleep(client_main['sleep_time_no_job'])

first_start = False
27 changes: 27 additions & 0 deletions tools/temperature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import os
import subprocess
from lib.global_config import GlobalConfig

# Absolute path of the directory containing this file; used below to locate
# the lm_sensors metric provider binary relative to it.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

def get_temperature(chip, feature):
    """Take a single temperature reading via the lm_sensors metric provider.

    Runs ``metric-provider-binary`` with ``-c <chip> -f <feature> -n 1`` and
    parses the second whitespace-separated field of its output.

    Args:
        chip: Chip name handed to the provider through ``-c``.
        feature: Feature name handed to the provider through ``-f``.

    Returns:
        The parsed integer value divided by 100 (a float).
        NOTE(review): presumably the provider reports hundredths of a degree
        - confirm against the provider's output format.

    Raises:
        RuntimeError: If ``chip`` or ``feature`` is not set, if the provider
            binary is missing or exits with a non-zero status, or if its
            output does not contain a parsable value.
    """
    if not feature or not chip:
        raise RuntimeError('You must set "base_temperature_chip" and "base_temperature_feature" in the config file. Please use calibration script to determine value.')

    try:
        output = subprocess.check_output(
            [f"{CURRENT_DIR}/../metric_providers/lm_sensors/metric-provider-binary", '-c', chip, '-f', feature, '-n', '1'],
            encoding='UTF-8',
        )
    except (FileNotFoundError, subprocess.CalledProcessError) as exc:
        raise RuntimeError('Could not get system temperature. Did you install lm_sensors and the corresponding metric provider correctly?') from exc

    # split() without an argument tolerates repeated spaces and trailing
    # newlines, so the value is reliably the second field.
    fields = output.split()
    try:
        return int(fields[1]) / 100
    except (IndexError, ValueError) as exc:
        # Empty or malformed output (e.g. chip/feature matched nothing) would
        # otherwise surface as a bare IndexError/ValueError with no context.
        raise RuntimeError(f"Could not parse a temperature value from lm_sensors provider output: {output!r}") from exc

if __name__ == '__main__':
    # Manual check: print one live reading next to the configured base value.
    machine_settings = GlobalConfig().config['machine']
    reading = get_temperature(
        machine_settings['base_temperature_chip'],
        machine_settings['base_temperature_feature'],
    )
    print('Current temperature is', reading)
    print('Base temperature is', machine_settings['base_temperature_value'])

0 comments on commit bb63530

Please sign in to comment.