diff --git a/api/api_helpers.py b/api/api_helpers.py index b9b5ce47c..912f4988b 100644 --- a/api/api_helpers.py +++ b/api/api_helpers.py @@ -93,7 +93,8 @@ def get_machine_list(): ) SELECT m.id, m.description, m.available, m.status_code, m.updated_at, m.jobs_processing, - m.gmt_hash, m.gmt_timestamp, m.cooldown_time_after_job, + m.gmt_hash, m.gmt_timestamp, + m.base_temperature, m.current_temperature, m.cooldown_time_after_job, (SELECT COUNT(id) FROM jobs as j WHERE j.machine_id = m.id AND j.state = 'WAITING') as job_amount, (SELECT avg_duration FROM timings WHERE timings.machine_id = m.id )::int as avg_duration_seconds diff --git a/config.yml.example b/config.yml.example index 832d1c8cf..d74f8478f 100644 --- a/config.yml.example +++ b/config.yml.example @@ -29,7 +29,6 @@ cluster: metrics_url: __METRICS_URL__ client: sleep_time_no_job: 300 - cooldown_time_after_job: 300 jobs_processing: "random" time_between_control_workload_validations: 21600 send_control_workload_status_mail: False @@ -53,7 +52,9 @@ machine: description: "Development machine for testing" # Takes a file path to log all the errors to it. This is disabled if False error_log_file: False - + base_temperature_value: False + base_temperature_chip: False + base_temperature_feature: False measurement: idle-time-start: 10 diff --git a/docker/structure.sql b/docker/structure.sql index b2731de39..162402fb9 100644 --- a/docker/structure.sql +++ b/docker/structure.sql @@ -11,6 +11,8 @@ CREATE TABLE machines ( status_code text, jobs_processing text, cooldown_time_after_job integer, + base_temperature integer, + current_temperature integer, gmt_hash text, gmt_timestamp timestamp with time zone, created_at timestamp with time zone DEFAULT now(), diff --git a/frontend/js/status.js b/frontend/js/status.js index c11cea553..a7a96eb28 100644 --- a/frontend/js/status.js +++ b/frontend/js/status.js @@ -34,6 +34,7 @@ $(document).ready(function () { case 'cleanup_start': return `${el} `; case 'cleanup_end': return `${el} `; case 'measurement_control_start': return `${el} `; + case 'cooldown': return `${el} `; case 'measurement_control_error': return `${el} `; case 'measurement_control_end': return `${el} `; case undefined: // fallthrough @@ -49,12 +50,14 @@ $(document).ready(function () { return `${`${el.substr(0,3)}...${el.substr(-3,3)}`} (${dateToYMD(new Date(row[7]), true)})`; }}, - { data: 8, title: 'Cooldown after Job', render: function(el) { + { data: 8, title: 'Base temp (°)'}, + { data: 9, title: 'Current temp (°)', render: (el) => el == null ? '-' : el}, + { data: 10, title: 'Cooldown time', render: function(el) { return (el == null) ? 'awaiting info': `${Math.round(el/60)} Minutes`; }}, - { data: 9, title: 'Waiting Jobs'}, - { data: 10, title: 'Estimated waiting time', render: function(el, type, row) { - return (row[8] == null || row[10] == null) ? 'awaiting info' : `${Math.round(( (row[8]+row[10]) * row[9]) / 60)} Minutes` + { data: 11, title: 'Waiting Jobs'}, + { data: 12, title: 'Estimated waiting time', render: function(el, type, row) { + return (row[10] == null || row[12] == null) ? 'awaiting info' : `${Math.round(( (row[10]+row[12]) * row[11]) / 60)} Minutes` }}, ], deferRender: true, diff --git a/metric_providers/lm_sensors/source.c b/metric_providers/lm_sensors/source.c index f27b19ae1..1cfc0a2b1 100644 --- a/metric_providers/lm_sensors/source.c +++ b/metric_providers/lm_sensors/source.c @@ -176,6 +176,7 @@ static void output_value(int value, char *container_id) { int main(int argc, char *argv[]) { int c, err; + int measurement_amount = -1; const char *config_file_name = NULL; // These are the lists that we pass in through the command line @@ -202,7 +203,7 @@ int main(int argc, char *argv[]) { setlocale(LC_CTYPE, ""); while (1) { - c = getopt_long(argc, argv, "c:f:hts:i:", long_opts, NULL); + c = getopt_long(argc, argv, "c:f:hts:i:n:", long_opts, NULL); if (c == EOF) break; switch (c) { case ':': @@ -236,6 +237,9 @@ int main(int argc, char *argv[]) { case 'i': msleep_time = atoi(optarg); break; + case 'n': + measurement_amount = atoi(optarg); + break; default: exit(1); } @@ -342,6 +346,9 @@ int main(int argc, char *argv[]) { g_string_free(chip_feature_str, 1); free(tmp_label); } + + if (measurement_amount != -1) measurement_amount--; // only decrement if switch was given to not overflow. + if (!measurement_amount) break; usleep(msleep_time * 1000); } g_list_free_full(chip_feature_output, free); diff --git a/migrations/2024_01_05_base_temperature.sql b/migrations/2024_01_05_base_temperature.sql new file mode 100644 index 000000000..14f557440 --- /dev/null +++ b/migrations/2024_01_05_base_temperature.sql @@ -0,0 +1,2 @@ +ALTER TABLE "machines" ADD COLUMN "base_temperature" integer; +ALTER TABLE "machines" ADD COLUMN "current_temperature" integer; diff --git a/tools/client.py b/tools/client.py index 9dec209ad..61c00add7 100644 --- a/tools/client.py +++ b/tools/client.py @@ -13,13 +13,14 @@ from lib.db import DB from lib.repo_info import get_repo_info from tools import validate +from tools.temperature import get_temperature from lib import email_helpers # We currently have this dynamically as it will probably change quite a bit -STATUS_LIST = ['job_no', 'job_start', 'job_error', 'job_end', 'cleanup_start', 'cleanup_stop', 'measurement_control_start', 'measurement_control_end', 'measurement_control_error'] +STATUS_LIST = ['cooldown', 'job_no', 'job_start', 'job_error', 'job_end', 'cleanup_start', 'cleanup_end', 'measurement_control_start', 'measurement_control_end', 'measurement_control_error'] CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) -def set_status(status_code, data=None, run_id=None): +def set_status(status_code, cur_temp, cooldown_time_after_job, data=None, run_id=None): # pylint: disable=redefined-outer-name config = GlobalConfig().config client = config['cluster']['client'] @@ -37,17 +38,17 @@ def set_status(status_code, data=None, run_id=None): query = """ UPDATE machines - SET status_code=%s, cooldown_time_after_job=%s, jobs_processing=%s, gmt_hash=%s, gmt_timestamp=%s + SET status_code=%s, cooldown_time_after_job=%s, current_temperature=%s, base_temperature=%s, jobs_processing=%s, gmt_hash=%s, gmt_timestamp=%s WHERE id = %s """ gmt_hash, gmt_timestamp = get_repo_info(CURRENT_DIR) - params = (status_code, client['cooldown_time_after_job'], client['jobs_processing'], gmt_hash, gmt_timestamp, config['machine']['id']) + params = (status_code, cooldown_time_after_job, cur_temp, config['machine']['base_temperature_value'], client['jobs_processing'], gmt_hash, gmt_timestamp, config['machine']['id']) DB().query(query=query, params=params) -def do_cleanup(): - set_status('cleanup_start') +def do_cleanup(cur_temp, cooldown_time_after_job): + set_status('cleanup_start', cur_temp, cooldown_time_after_job) result = subprocess.run(['sudo', os.path.join(os.path.dirname(os.path.abspath(__file__)),'cluster/cleanup.sh')], @@ -55,59 +56,72 @@ def do_cleanup(): stderr=subprocess.PIPE, check=True,) - set_status('cleanup_stop', f"stdout: {result.stdout}, stderr: {result.stderr}") + set_status('cleanup_end', cur_temp, cooldown_time_after_job, data=f"stdout: {result.stdout}, stderr: {result.stderr}") # pylint: disable=broad-except if __name__ == '__main__': - client_main = GlobalConfig().config['cluster']['client'] + config_main = GlobalConfig().config + client_main = config_main['cluster']['client'] cwl = client_main['control_workload'] - - first_start = True + cooldown_time = 0 + last_cooldown_time = 0 while True: job = Job.get_job('run') - if first_start or validate.is_validation_needed(client_main['time_between_control_workload_validations']): - set_status('measurement_control_start') + current_temperature = get_temperature( + GlobalConfig().config['machine']['base_temperature_chip'], + GlobalConfig().config['machine']['base_temperature_feature'] + ) + + if current_temperature > config_main['machine']['base_temperature_value']: + print(f"Machine is still too hot: {current_temperature}°. Sleeping for 1 minute") + set_status('cooldown', current_temperature, last_cooldown_time) + cooldown_time += 60 + time.sleep(60) + continue + + print('Machine is cool enough. Continuing') + last_cooldown_time = cooldown_time + cooldown_time = 0 + + if validate.is_validation_needed(client_main['time_between_control_workload_validations']): + set_status('measurement_control_start', current_temperature, last_cooldown_time) validate.run_workload(cwl['name'], cwl['uri'], cwl['filename'], cwl['branch']) - set_status('measurement_control_end') + set_status('measurement_control_end', current_temperature, last_cooldown_time) - stddev_data = validate.get_workload_stddev(cwl['uri'], cwl['filename'], cwl['branch'], GlobalConfig().config['machine']['id'], cwl['comparison_window'], cwl['phase'], cwl['metrics']) + stddev_data = validate.get_workload_stddev(cwl['uri'], cwl['filename'], cwl['branch'], config_main['machine']['id'], cwl['comparison_window'], cwl['phase'], cwl['metrics']) print('get_workload_stddev returned: ', stddev_data) try: message = validate.validate_workload_stddev(stddev_data, cwl['threshold']) - if GlobalConfig().config['admin']['no_emails'] is False and client_main['send_control_workload_status_mail']: + if config_main['admin']['no_emails'] is False and client_main['send_control_workload_status_mail']: email_helpers.send_admin_email(f"Machine is operating normally. All STDDEV below {cwl['threshold'] * 100} %", "\n".join(message)) except Exception as exception: validate.handle_validate_exception(exception) - set_status('measurement_control_error') + set_status('measurement_control_error', current_temperature, last_cooldown_time) # the process will now go to sleep for 'time_between_control_workload_validations'' # This is as long as the next validation is needed and thus it will loop # endlessly in validation until manually handled, which is what we want. time.sleep(client_main['time_between_control_workload_validations']) finally: - time.sleep(client_main['cooldown_time_after_job']) - do_cleanup() + do_cleanup(current_temperature, last_cooldown_time) elif job: - set_status('job_start', '', job._run_id) + set_status('job_start', current_temperature, last_cooldown_time, run_id=job._run_id) try: job.process(docker_prune=True) - set_status('job_end', '', job._run_id) + set_status('job_end', current_temperature, last_cooldown_time, run_id=job._run_id) except Exception as exc: - set_status('job_error', str(exc), job._run_id) + set_status('job_error', current_temperature, last_cooldown_time, data=str(exc), run_id=job._run_id) handle_job_exception(exc, job) finally: - time.sleep(client_main['cooldown_time_after_job']) - do_cleanup() + do_cleanup(current_temperature, last_cooldown_time) else: - do_cleanup() - set_status('job_no') + do_cleanup(current_temperature, last_cooldown_time) + set_status('job_no', current_temperature, last_cooldown_time) if client_main['shutdown_on_job_no'] is True: subprocess.check_output(['sudo', 'shutdown']) time.sleep(client_main['sleep_time_no_job']) - - first_start = False diff --git a/tools/temperature.py b/tools/temperature.py new file mode 100644 index 000000000..6f9d9bc89 --- /dev/null +++ b/tools/temperature.py @@ -0,0 +1,27 @@ +import os +import subprocess +from lib.global_config import GlobalConfig + +CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) + +def get_temperature(chip, feature): + if not feature or not chip: + raise RuntimeError('You must set "base_temperature_chip" and "base_temperature_feature" in the config file. Please use calibration script to determine value.') + + try: + output = subprocess.check_output( + [f"{CURRENT_DIR}/../metric_providers/lm_sensors/metric-provider-binary", '-c', chip, '-f', feature, '-n', '1'], + encoding='UTF-8', + ) + except (FileNotFoundError, subprocess.CalledProcessError) as exc: + raise RuntimeError('Could not get system temperature. Did you install lm_sensors and the corresponding metric provider correctly?') from exc + + return int(output.split(' ')[1])/100 + +if __name__ == '__main__': + cur = get_temperature( + GlobalConfig().config['machine']['base_temperature_chip'], + GlobalConfig().config['machine']['base_temperature_feature'] + ) + print('Current temperature is', cur) + print('Base temperature is', GlobalConfig().config['machine']['base_temperature_value'])