diff --git a/legal-disclaimer.md b/legal-disclaimer.md index 0564961..2aec0b8 100644 --- a/legal-disclaimer.md +++ b/legal-disclaimer.md @@ -1,6 +1,6 @@ -## Legal Notice and Disclaimer +## Legal Notice and Disclaimer -No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document. +No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document. Habana Labs disclaims all warranties, including without limitation, the implied warranties of merchantability, fitness for a particular purpose, and non-infringement, as well as any warranty arising from course of performance, course of dealing, or usage in trade. @@ -8,7 +8,7 @@ All information provided here is subject to change without notice. Habana Labs m The products described may contain design defects or errors known as errata which may cause the product to deviate from published specifications. Current characterized errata are available on request. -Software and workloads used in performance tests may have been optimized for performance only on Habana Labs hardware. Performance tests, such as SYSmark and MobileMark, are measured using specific computer systems, components, software, operations and functions. Any change to any of those factors may cause the results to vary. You should consult other information and performance tests to assist you in fully evaluating your contemplated purchases, including the performance of that product when combined with other products. +Software and workloads used in performance tests may have been optimized for performance only on Habana Labs hardware. Performance tests, such as SYSmark and MobileMark, are measured using specific computer systems, components, software, operations and functions. Any change to any of those factors may cause the results to vary. 
You should consult other information and performance tests to assist you in fully evaluating your contemplated purchases, including the performance of that product when combined with other products. No product or component can be absolutely secure. @@ -16,4 +16,4 @@ Habana Labs, Gaudi and SynapseAI are trademarks of Habana Labs in the U.S. and/o *Other names and brands may be claimed as the property of others. -© 2021 Habana Labs +© 2021 Habana Labs diff --git a/utils/README.md b/utils/README.md index 31ddac8..7c29b9d 100644 --- a/utils/README.md +++ b/utils/README.md @@ -4,8 +4,18 @@ By installing, copying, accessing, or using the software, you agree to be legall ## Table of Contents +- [Gaudi Utils](#gaudi-utils) + - [Table of Contents](#table-of-contents) - [Overview](#overview) - - [manage_network_ifs.sh](#manage_network_ifs) + - [manage\_network\_ifs](#manage_network_ifs) + - [Operations](#operations) + - [Up](#up) + - [Down](#down) + - [Status](#status) + - [Set IP](#set-ip) + - [Unset IP](#unset-ip) + - [check\_habana\_framework\_env](#check_habana_framework_env) + - [Habana Health Screen (HHS)](#habana-health-screen-hhs) ## Overview @@ -22,23 +32,23 @@ This script can be used as reference to bring up, take down, set IPs, unset IPs The following is the usage of the script: ``` -usage: ./manage_network_ifs.sh [options] +usage: ./manage_network_ifs.sh [options] -options: - --up toggle up all Habana network interfaces - --down toggle down all Habana network interfaces - --status print status of all Habana network interfaces - --set-ip set IP for all internal Habana network interfaces - --unset-ip unset IP from all internal Habana network interfaces - -v, --verbose print more logs - -h, --help print this help +options: + --up toggle up all Habana network interfaces + --down toggle down all Habana network interfaces + --status print status of all Habana network interfaces + --set-ip set IP for all internal Habana network interfaces + --unset-ip unset IP from all 
internal Habana network interfaces + -v, --verbose print more logs + -h, --help print this help Note: Please run this script with one operation at a time ``` ## Operations -Before executing any operation, this script finds all the Habana network interfaces available on the system and stores the Habana interface information into a list. -The list will be used for the operations. If no Habana network interface is found, the script will exit. +Before executing any operation, this script finds all the Habana network interfaces available on the system and stores the Habana interface information into a list. +The list will be used for the operations. If no Habana network interface is found, the script will exit. ### Up @@ -87,4 +97,40 @@ Check health of HPUs for PyTorch optional arguments: -h, --help show this help message and exit --cards CARDS Set number of cards to test (default: 1) +``` + +## Habana Health Screen (HHS) + +**Habana Health Screen** (HHS) tool has been developed to verify the cluster network health through a suite of diagnostic tests. The test +includes checking gaudi port status, running small workloads, and running standard collective operations arcoss multiple systems. 
+ +``` bash +usage: screen.py [-h] [--initialize] [--screen] [--target-nodes TARGET_NODES] + [--job-id JOB_ID] [--round ROUND] [--config CONFIG] + [--hhs-check [{node,hccl-demo,none}]] [--node-write-report] + [--node-name NODE_NAME] [--logs-dir LOGS_DIR] + +optional arguments: + -h, --help show this help message and exit + --initialize Downloads Necessary Repos and Creates Report Template + --screen Starts Health Screen for Cluster + --target-nodes TARGET_NODES + List of target nodes + --job-id JOB_ID Needed to identify hccl-demo running log + --round ROUND Needed to identify hccl-demo running round log + --config CONFIG Configuration file for Health Screener + --hhs-check [{node,hccl-demo,none}] + Check HHS Status for Node (Ports status, Device Acquire Fail) or all_reduce + (HCCL_DEMO between paris of nodes) + --node-write-report Write Individual Node Health Report + --node-name NODE_NAME Name of Node + --logs-dir LOGS_DIR Output directory of health screen results +``` + +To run a full HHS test, run the below command: + +``` bash +# Creates HHS Report and screens clusters for any infected nodes. +# Will check Level 1 and 2 by default +python screen.py --initialize --screen ``` \ No newline at end of file diff --git a/utils/habana_health_screen/.gitignore b/utils/habana_health_screen/.gitignore new file mode 100644 index 0000000..d44aff4 --- /dev/null +++ b/utils/habana_health_screen/.gitignore @@ -0,0 +1,5 @@ +tmp/* +build/* +logs/* +.graph_dump/* +__pycache__* \ No newline at end of file diff --git a/utils/habana_health_screen/HNodes.py b/utils/habana_health_screen/HNodes.py new file mode 100644 index 0000000..4cf0abf --- /dev/null +++ b/utils/habana_health_screen/HNodes.py @@ -0,0 +1,221 @@ +# Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class HNodes():
    """Container tracking the cluster's nodes and their health-screen state."""

    def __init__(self, health_report=None):
        """ Keeps track of nodes and their current states.

        Args:
            health_report (HabanaHealthReport, optional): HHS Health Report. Defaults to None,
                in which case a new HabanaHealthReport() is created for this instance.
        """
        # BUG FIX: the original default was `health_report=HabanaHealthReport()`, which is
        # evaluated once at class-definition time — every HNodes() built with the default
        # would silently share a single report object. A None sentinel gives each
        # instance its own report while keeping the call signature backward-compatible.
        if health_report is None:
            health_report = HabanaHealthReport()

        # Node bookkeeping lists, populated by the screening driver.
        self.all_nodes = list()
        self.launcher_nodes = list()
        self.worker_nodes = list()
        self.healthy_nodes = list()
        self.infected_nodes = list()

        self.groups_tracker = list()

        self.health_report = health_report
        # Logs are written next to the report files.
        self.log_dir = health_report.f_dir
class HCard():
    """Health state and diagnostics for a single Gaudi card.

    Wraps the per-card checks used by the Level 1 screen: link state,
    device-acquire, port direction, and temperature classification.
    """

    def __init__(self, index=-1, module_id=-1, pci_address="", memory_used=-1, framework="pytorch", temperature=-1, logger=None):
        self.logger = logger
        self.index = index
        self.module_id = module_id
        self.pci_address = pci_address
        self.memory_used = memory_used
        self.temperature_C = temperature
        self.temperature_state_C = ""

        self.framework = framework
        self.down_links = list()
        self.device_acquire_fail = False
        self.multi_node_fail = False

        # Ports expected to face other systems; everything else should be internal.
        # NOTE(review): assumes external ports are 1, 8, 9 on this platform — confirm.
        self.external_ports = [1, 8, 9]
        self.incorrect_ports_direction = list()

    def check_health(self, num_checks_link_state=10):
        """Run the full Level 1 card check: link state, device acquire, temperature."""
        self.check_link_state(attempts=num_checks_link_state, sleep_sec=0.2)
        self.check_device_acquire_fail()
        self.check_temperature_state()

    def check_link_state(self, attempts=10, sleep_sec=0.5):
        """Poll `hl-smi -n link` `attempts` times and record ports ever seen DOWN.

        Returns:
            list[int]: indices of ports observed DOWN at least once.
        """
        self.logger.debug(f"Checking {self.pci_address} Link State. Will check {attempts} times")
        cmd = f"hl-smi -n link -i {self.pci_address}"
        down_links = set()

        for a in range(attempts):
            output = run_cmd(cmd)
            links_state = output.strip().split("\n")

            for i, status in enumerate(links_state):
                if "DOWN" in status:
                    down_links.add(i)
                    self.logger.debug(f"Attempt: {a} Port: {i} DOWN")

            time.sleep(sleep_sec)

        self.down_links = list(down_links)

        return self.down_links

    def check_port_direction(self):
        """Compare each port's reported direction against `external_ports`.

        Returns:
            list[int]: ports whose direction (internal/external) is inverted.
        """
        self.logger.debug(f"Checking {self.pci_address} Port Directions")

        incorrect_ports_direction = list()
        cmd = f"hl-smi -n ports -i {self.pci_address}"
        output = run_cmd(cmd)

        ports_direction = output.strip().split("\n")
        if ports_direction[-1] == "":
            ports_direction.pop()

        for i, direction in enumerate(ports_direction):
            if i in self.external_ports:
                if "internal" in direction:
                    incorrect_ports_direction.append(i)
            else:
                if "external" in direction:
                    incorrect_ports_direction.append(i)

        self.incorrect_ports_direction = incorrect_ports_direction

        return incorrect_ports_direction

    def check_device_acquire_fail(self):
        """Try to acquire the device via a framework smoke test in a worker process.

        Returns:
            bool: True if acquiring the device failed.
        """
        self.logger.debug(f"Checking {self.pci_address} for Device Acquire Issues")

        # Project-local helper; imported lazily so the module loads without it.
        from build.Setup_and_Install.utils import check_habana_framework_env

        self.device_acquire_fail = False
        fw_test = check_habana_framework_env.pytorch_test
        if self.framework == "tensorflow":
            fw_test = check_habana_framework_env.tensorflow_test

        try:
            with Pool() as pool:
                # BUG FIX: args must be a tuple. The original passed args=(self.module_id),
                # which is the bare value, so Pool.apply would iterate it (character by
                # character for a string module id) instead of passing it whole.
                pool.apply(fw_test, args=(self.module_id,))
        except Exception:
            # RuntimeError/AssertionError are the expected failure modes, but any
            # exception here means the card could not be acquired cleanly.
            self.device_acquire_fail = True
            self.logger.warning(f"{self.pci_address} Device Acquire Failure")

        return self.device_acquire_fail

    def check_temperature_state(self):
        """Classify temperature_C: CRITICAL at >= 83C, WARN at >= 25C above a 25C baseline.

        The original file defined this method twice, byte-for-byte identical; the
        duplicate (which silently shadowed the first definition) has been removed.
        """
        max_good_temperature = 83
        base_temperature = 25
        max_delta = 25

        if self.temperature_C >= max_good_temperature:
            self.temperature_state_C = "CRITICAL"
        elif self.temperature_C - base_temperature >= max_delta:
            self.temperature_state_C = "WARN"

    def __str__(self):
        report_str = f""" Index: {self.index}
    Module Id: {self.module_id}
    PCI Address: {self.pci_address}
    Temperature: {self.temperature_C} C
    Temperature State: {self.temperature_state_C}
    Down Links: {self.down_links}
    Device Acquire Fail: {self.device_acquire_fail}"""

        return report_str
# Module-level logger shared by the report helpers (same name as in the original file).
_logger = logging.getLogger("habana_health_screener")


class HabanaHealthReport():
    """CSV-backed health report for the Habana Health Screen (HHS).

    Maintains two reports: a base per-card report (Level 1) and an HCCL-demo
    node-pair report (Level 2), plus helpers to update, merge and summarize them.
    """

    def __init__(self, f_dir="tmp", report_name="health_report.csv"):
        """ Initialize the Habana Health Report.

        Args:
            f_dir (str, optional): Directory storing Health Report logs and results. Defaults to "tmp".
            report_name (str, optional): File name of the Health Report csv. Defaults to "health_report.csv".
        """
        self.header = ["node_id", "index", "module_id", "pci_address", "temperature_C", "temperature_state_C", "device_acquire_fail", "down_links", "multi_node_fail", "missing"]

        self.f_dir = f_dir
        self.report_name = report_name
        self.f_path = f"{self.f_dir}/{self.report_name}"

        self.header_hccl_demo = ["round", "group_id", "node_ids", "num_nodes", "multi_node_fail", "missing", "qpc_fail"]
        self.f_path_hccl_demo = f"{self.f_dir}/{os.path.splitext(self.report_name)[0]}_hccl_demo.csv"

    def create(self, create_base=True, create_hccl_demo=False):
        """ Create the CSV Health Report files (base and/or HCCL demo).

        Args:
            create_base (bool, optional): Create the base health report CSV. Defaults to True.
            create_hccl_demo (bool, optional): Create the HCCL-demo report if it doesn't exist. Defaults to False.
        """
        dir_name = os.path.dirname(self.f_path)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

        if create_base:
            with open(self.f_path, "w+", newline='') as f:
                writer = csv.DictWriter(f, fieldnames=self.header, extrasaction='ignore')
                writer.writeheader()
                _logger.info(f"Created {self.f_path} with header: {self.header}")

        if create_hccl_demo and not self.exist(level=2):
            with open(self.f_path_hccl_demo, "w+", newline='') as f:
                writer = csv.DictWriter(f, fieldnames=self.header_hccl_demo, extrasaction='ignore')
                writer.writeheader()
                _logger.info(f"Created {self.f_path_hccl_demo} with header: {self.header_hccl_demo}")

    def exist(self, level=1):
        """ Check whether a report CSV already exists.

        Args:
            level (int, optional): 1 for the base report, 2 for the HCCL-demo report. Defaults to 1.

        Returns:
            bool: True if the selected report file exists.
        """
        f_path = self.f_path

        if level == 2:
            f_path = self.f_path_hccl_demo

        return os.path.exists(f_path)

    def write_rows(self, cards=None, node_id="", data=None, level=1):
        """ Append health-check result rows to the appropriate report CSV.

        Args:
            cards (list[HCard], optional): Level 1 HCards to report about. Defaults to None (no cards).
            node_id (str, optional): Node ID of the HCards. Defaults to "".
            data (list[dict], optional): Pre-built CSV row dicts. Defaults to None.
            level (int|str, optional): Health Screen level, 1 or 2. Defaults to 1.

        Raises:
            ValueError: if level is not 1 or 2.
        """
        # BUG FIX: the original used mutable defaults (cards=list(), data=list()) and
        # appended into `data`, so rows written by one call leaked into every later
        # call that relied on the default — duplicating rows in the report.
        cards = [] if cards is None else cards
        data = [] if data is None else list(data)

        # Callers (e.g. consolidate_health_report, which documents `level` as a str)
        # may pass the level as a string; normalize so the comparisons below work.
        # Previously a non-int level fell through both branches and crashed with
        # an UnboundLocalError on f_path.
        level = int(level)

        if level == 1:
            f_path = self.f_path
            header = self.header

            if len(data) == 0:
                for c in cards:
                    d = c.__dict__
                    d["node_id"] = node_id
                    data.append(d)
        elif level == 2:
            f_path = self.f_path_hccl_demo
            header = self.header_hccl_demo
        else:
            raise ValueError(f"Unsupported health screen level: {level}")

        with open(f_path, "a", newline='') as f:
            # Exclusive lock: multiple worker processes may append concurrently.
            fcntl.flock(f, fcntl.LOCK_EX)
            writer = csv.DictWriter(f, fieldnames=header, extrasaction='ignore')
            writer.writerows(data)
            time.sleep(0.1)
            fcntl.flock(f, fcntl.LOCK_UN)

    def update_health_report(self, detected_nodes, infected_nodes, missing_nodes):
        """ Update health_report with hccl_demo results.

        Args:
            detected_nodes (set[str]): Detected node_ids (consumed via .copy()/.discard — a set, despite the original doc saying list).
            infected_nodes (set[str]): Infected node_ids.
            missing_nodes (set[str]): Missing node_ids; mutated in place (.discard) as rows are matched.
        """
        tmp_f = NamedTemporaryFile(mode='w', delete=False)
        detected_nodes_cp = detected_nodes.copy()

        with open(self.f_path, 'r', newline='') as csvfile, tmp_f:
            reader = csv.DictReader(csvfile)
            writer = csv.DictWriter(tmp_f, fieldnames=self.header)

            writer.writeheader()
            for row in reader:
                if row["node_id"] in infected_nodes or row["node_id"] in missing_nodes:
                    row["multi_node_fail"] = True
                elif row["node_id"] in detected_nodes_cp:
                    row["multi_node_fail"] = False
                    row["missing"] = False

                writer.writerow(row)

                missing_nodes.discard(row["node_id"])
                detected_nodes_cp.discard(row["node_id"])

            # These are unreported Detected Nodes. Add to Report
            if len(detected_nodes_cp):
                for n in detected_nodes_cp:
                    writer.writerow({"node_id": n, "multi_node_fail": False, "missing": False})

            # These are unreported Missing Nodes. Add to Report
            if len(missing_nodes):
                for n in missing_nodes:
                    writer.writerow({"node_id": n, "multi_node_fail": True, "missing": True})

        # Atomically replace the report with the rewritten copy.
        shutil.move(tmp_f.name, self.f_path)

    def update_hccl_demo_health_report(self, round, all_node_pairs, multi_node_fail, qpc_fail, missing_nodes):
        """ Update the hccl_demo report rows for the given round, based on infected nodes.

        Args:
            round: Level 2 round to update. NOTE(review): rows read from CSV hold the
                round as a string, so callers must pass a matching str — confirm at call site.
            all_node_pairs (dict): Node-pair rows reported by the Level 2 round, keyed by the node_ids string
                (a dict, despite the original doc saying list; entries are deleted as rows are matched).
            multi_node_fail (list[str]): Node pairs that failed the HCCL_Demo test.
            qpc_fail (list[str]): Node pairs that failed HCCL_Demo due to a QPC error.
            missing_nodes (list[str]): Node pairs that couldn't run HCCL_Demo.
        """
        tmp_f = NamedTemporaryFile(mode='w', delete=False)

        with open(self.f_path_hccl_demo, 'r', newline='') as csvfile, tmp_f:
            reader = csv.DictReader(csvfile)
            writer = csv.DictWriter(tmp_f, fieldnames=self.header_hccl_demo, extrasaction='ignore')

            writer.writeheader()
            for row in reader:
                if row["round"] == round:
                    row["multi_node_fail"] = (row["node_ids"] in multi_node_fail)
                    row["qpc_fail"] = (row["node_ids"] in qpc_fail)
                    row["missing"] = (row["node_ids"] in missing_nodes)

                    if row["node_ids"] in all_node_pairs:
                        del all_node_pairs[row["node_ids"]]

                # Rows from other rounds pass through unchanged.
                writer.writerow(row)

            # These are unreported node_pairs. Add remaining node pairs
            if len(all_node_pairs):
                writer.writerows(list(all_node_pairs.values()))

        shutil.move(tmp_f.name, self.f_path_hccl_demo)

    def check_screen_complete(self, num_nodes, hccl_demo=False, round=0):
        """ Check on status of a Health Screen pass.

        The screen is considered done when every expected node has reported
        (8 card rows per node).

        Args:
            num_nodes (int): Number of nodes screened.
            hccl_demo (bool, optional): Check the HCCL_DEMO report instead of the base one. Defaults to False.
            round (int, optional): Level 2 round to check; ignored for Level 1 runs. Defaults to 0.

        Returns:
            (bool, int): (all nodes reported, number of fully-reported nodes found)
        """
        f_path = self.f_path if (not hccl_demo) else self.f_path_hccl_demo
        n_cards_per_node = 8

        with open(f_path, "r", newline='') as f:
            reader = csv.DictReader(f)

            if hccl_demo:
                n_cards = 0
                for row in reader:
                    if int(row["round"]) == round:
                        n_cards += (int(row["num_nodes"]) * n_cards_per_node)
            else:
                n_cards = len(list(reader))

        total_cards = n_cards_per_node * num_nodes
        has_all_nodes_info = (n_cards == total_cards)
        num_found_nodes = n_cards // n_cards_per_node

        return has_all_nodes_info, num_found_nodes

    def extract_node_info(self):
        """ Extract Detected, Infected, and Missing Nodes from the base Health Report.

        Returns:
            (set, set, set): (Detected Nodes, Infected Nodes, Missing Nodes)
        """
        detected_nodes = set()
        missing_nodes = set()
        device_acquire_fail_set = set()
        down_links_set = set()
        temperature_fail_set = set()
        temperature_warn_set = set()

        with open(self.f_path, "r", newline='') as f:
            reader = csv.DictReader(f)
            for row in reader:
                detected_nodes.add(row["node_id"])

                # CSV round-trips everything as strings, hence the "True"/"[]" comparisons.
                if row["device_acquire_fail"] == "True":
                    device_acquire_fail_set.add(row["node_id"])
                if row["down_links"] != "[]" and row["down_links"] != "":
                    down_links_set.add(row["node_id"])
                if row["missing"] == "True":
                    missing_nodes.add(row["node_id"])
                if row["temperature_state_C"] == "CRITICAL":
                    temperature_fail_set.add(row["node_id"])
                if row["temperature_state_C"] == "WARN":
                    temperature_warn_set.add(row["node_id"])

        if len(device_acquire_fail_set):
            _logger.info(f"{len(device_acquire_fail_set)} Infected (Device Acquire fail): {sorted(list(device_acquire_fail_set))}")
        if len(down_links_set):
            _logger.info(f"{len(down_links_set)} Infected (Down Links): {sorted(list(down_links_set))}")
        if len(temperature_warn_set):
            _logger.info(f"{len(temperature_warn_set)} Infected (Temperature WARN): {sorted(list(temperature_warn_set))}")
        if len(temperature_fail_set):
            _logger.info(f"{len(temperature_fail_set)} Infected (Temperature CRITICAL): {sorted(list(temperature_fail_set))}")

        infected_nodes = set()
        infected_nodes.update(device_acquire_fail_set)
        infected_nodes.update(down_links_set)
        infected_nodes.update(temperature_fail_set)
        infected_nodes.update(temperature_warn_set)

        return detected_nodes, infected_nodes, missing_nodes

    def extract_hccl_demo_info(self):
        """ Extract Detected, Infected, and Missing Nodes from the HCCL DEMO Health Report.

        A node counts as infected (or missing) only if *every* pair it
        participated in failed (or was missing).

        Returns:
            (set, set, set): (Detected Nodes, Infected Nodes, Missing Nodes)
        """
        detected_nodes = set()
        infected_nodes = set()
        missing_nodes = set()
        fail_checks = defaultdict(list)
        missing_checks = defaultdict(list)

        with open(self.f_path_hccl_demo, "r", newline='') as f:
            reader = csv.DictReader(f)
            for row in reader:
                # node_ids is the str() of a list, e.g. "['sys-a', 'sys-b']".
                node_ids = row["node_ids"].strip("[']").replace("'", "").split(', ')
                detected_nodes.update(node_ids)

                for n in node_ids:
                    fail_status = int(row["multi_node_fail"] == "True")
                    fail_checks[n].append(fail_status)

                    missing_status = int(row["missing"] == "True")
                    missing_checks[n].append(missing_status)

        for n, v in fail_checks.items():
            if sum(v) == len(v):
                infected_nodes.add(n)

        for n, v in missing_checks.items():
            if sum(v) == len(v):
                missing_nodes.add(n)

        detected_nodes -= missing_nodes
        infected_nodes -= missing_nodes

        _logger.info(f"{len(infected_nodes)} Infected (HCCL): {sorted(list(infected_nodes))}")

        return detected_nodes, infected_nodes, missing_nodes

    def gather_health_report(self, level, remote_path, hosts):
        """ Gathers Health Reports from all hosts.

        Args:
            level (str): HHS Level
            remote_path (str): Remote destination of the HHS Report
            hosts (list): List of IP Addresses to gather HHS Reports from
        """
        # `copy_files` is the project helper imported from `utilities` at file top.
        copy_files(src=f"{remote_path}/habana_health_screen/{self.f_dir}/L{level}",
                   dst=f"{self.f_dir}",
                   hosts=hosts,
                   to_remote=False)

    def consolidate_health_report(self, level, report_dir):
        """ Consolidates the health_report_*.csv from worker pods into a single master csv file.

        Args:
            level (str): HHS Level
            report_dir (str): Directory of CSV files to merge
        """
        data = list()
        path = f"{report_dir}/L{level}/health_report_*.csv"
        csv_files = glob.glob(path)

        for f in csv_files:
            with open(f, 'r', newline='') as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    data.append(row)

        self.write_rows(data=data, level=level)
1.0.0 + +A large scale Gaudi cluster contains a lot of moving parts. To ensure distributed training proceeds smoothly, it is recommended to check the +cluster network health. Troubleshooting issues on a large cluster can be a tedious act. To simplify the debugging process the +**Habana Health Screen** (HHS) tool has been developed to verify the cluster network health through a suite of diagnostic tests. The test +includes checking gaudi port status, running small workloads, and running standard collective operations arcoss multiple systems + +HHS is capable of running on a Kubernetes cluster or on a baremetal cluster. It is an active scan, which will block other users from training +on a gaudi systems until the scans are complete. At the end of the scans, HHS produces a CSV report detailing the state of each gaudi card. + +It is reccomended to run HHS in the below scenarios: + +* After a system upgrade/update +* Before running a long term training +* Pinpointing problematic systems in a cluster if a problem can't be isolated to a single system + +HHS runs a multi-tiered configurable scan: + +* Level 1 - Individual System Diagnostics +* Level 2 - Multi-System Communication Diagnostics + +## Level 1 - Individual System Diagnostic + +Level 1 focuses on individual Gaudi Cards Health Diagnostics. + +| Test | Description | +| ------------------------- | ---------------------------------------------------------- | +| Gaudi Ports Status | Checks if ports are DOWN | +| Device Acquire Failures | Checks if devices are busy | + +**2 System Cluster Example** + +Here is an example of running HHS on a 2 system cluster. 
It identifies the Gaudi Cards that have down links, device acquire issues, and +flags for multi node communication failure + +| node_id | index | module_id | pci_address | temperature_C | temperature_C | device_acquire_fail | down_links | multi_node_fail | missing | +| -------- | ----- | --------- | ------------ | ------------- | ------------- | ------------------- | ---------- | ----------------| ------- | +| sys-9-05 | 0 | 3 | 0000:19:00.0 | 22 | | False | [9] | True | False | +| sys-9-05 | 1 | 7 | 0000:b3:00.0 | 60 | WARN | False | [7] | True | False | +| sys-9-05 | 2 | 2 | 0000:1a:00.0 | 84 | CRITICAL | False | [5, 7] | True | False | +| sys-9-05 | 3 | 6 | 0000:b4:00.0 | 23 | | False | [4] | True | False | +| sys-9-05 | 4 | 1 | 0000:33:00.0 | 25 | | False | [4, 5] | True | False | +| sys-9-05 | 5 | 5 | 0000:cc:00.0 | 24 | | False | [4, 5] | True | False | +| sys-9-05 | 6 | 0 | 0000:34:00.0 | 27 | | False | [4, 5] | True | False | +| sys-4-04 | 7 | 4 | 0000:cd:00.0 | 28 | | False | [] | False | False | +| sys-4-04 | 0 | 3 | 0000:19:00.0 | 28 | | False | [] | False | False | +| sys-4-04 | 1 | 7 | 0000:b3:00.0 | 28 | | False | [] | False | False | +| sys-4-04 | 2 | 2 | 0000:1a:00.0 | 28 | | False | [] | False | False | +| sys-4-04 | 3 | 0 | 0000:34:00.0 | 24 | | False | [] | False | False | +| sys-4-04 | 4 | 6 | 0000:b4:00.0 | 24 | | False | [] | False | False | +| sys-4-04 | 5 | 1 | 0000:33:00.0 | 21 | | False | [] | False | False | +| sys-4-04 | 6 | 5 | 0000:cc:00.0 | 21 | | False | [] | False | False | +| sys-4-04 | 7 | 4 | 0000:cd:00.0 | 26 | | False | [] | False | False | + +``` log +[2023-02-07 09:02:39] INFO Infected (Temperature WARN) 1 Node: ['sys-9-05'] +[2023-02-07 09:02:39] INFO Infected (Temperature CRITICAL) 1 Node: ['sys-9-05'] +[2023-02-07 09:02:39] INFO Infected 1 Node: ['sys-9-05'] +[2023-02-07 09:02:39] INFO Missing 0 Node: [] +[2023-02-07 09:02:39] INFO Healthy 1 Node: ["sys-4-04"] + +[2023-02-07 09:02:39] INFO Detected 36 Node: 
["sys-4-04","sys-9-05"] + +``` + + +## Level 2 - Multi-System Communication Diagnostics + +Level 2 performs a collective communication all reduce test between multiple system through [HCCL_DEMO](https://github.com/HabanaAI/hccl_demo] repo. +It runs X rounds with unique pairs of systems ensuring that a system is able to communicate across different sets of systems. If no +pair systems have failed, then the testing will stop. If there was a system with communication issues, it will be flagged on the +first round. + +** Multi Node Cluster Example** + +Here is an example of running HHS for 2 rounds and the results gets recorded to `hccl_demo_health_report.csv`. It identifies node pairs that failed the all_reduce test. If "True" is flagged +in the multi_node_fail column, then one of the nodes has a communication issue. List of infected nodes will be printed out to +the log as well as the `health_report.csv` multi_node_fail column. + +| round | group_id | node_ids | num_nodes | multi_node_fail | missing | qpc_fail | +| ----- | -------- | ------------------------ | --------- | --------------- | ------- | -------- | +| 0 | 11 | ['sys-7-01', 'sys-9-05'] | 2 | True | False | True | +| 0 | 4 | ['sys-2-03', 'sys-4-04'] | 2 | True | True | False | +| 0 | 13 | ['sys-6-06', 'sys-9-06'] | 2 | False | False | False | +| 0 | 1 | ['sys-3-01', 'sys-9-01'] | 2 | False | False | False | +| 0 | 2 | ['sys-6-03', 'sys-8-01'] | 2 | False | False | False | +| 0 | 0 | ['sys-3-06', 'sys-6-02'] | 2 | False | False | False | +| 0 | 10 | ['sys-2-01', 'sys-4-01'] | 2 | False | False | False | +| 0 | 6 | ['sys-6-05', 'sys-9-03'] | 2 | False | False | False | +| 0 | 14 | ['sys-4-05', 'sys-8-03'] | 2 | False | False | False | +| 0 | 12 | ['sys-6-04', 'sys-8-05'] | 2 | False | False | False | +| 0 | 8 | ['sys-7-06', 'sys-9-02'] | 2 | False | False | False | +| 0 | 5 | ['sys-3-04', 'sys-7-02'] | 2 | False | False | False | +| 0 | 3 | ['sys-4-03', 'sys-6-01'] | 2 | False | False | False | +| 0 | 7 | 
['sys-2-06', 'sys-3-03'] | 2 | False | False | False | +| 0 | 9 | ['sys-2-04', 'sys-9-04'] | 2 | False | False | False | +| 1 | 1 | ['sys-3-04', 'sys-4-05'] | 2 | False | False | False | +| 1 | 20 | ['sys-2-03', 'sys-7-02'] | 2 | True | True | False | +| 1 | 19 | ['sys-3-01', 'sys-9-03'] | 2 | False | False | False | +| 1 | 0 | ['sys-3-03', 'sys-9-04'] | 2 | False | False | False | +| 1 | 12 | ['sys-4-04', 'sys-6-02'] | 2 | False | False | False | +| 1 | 9 | ['sys-4-03', 'sys-6-05'] | 2 | False | False | False | +| 1 | 14 | ['sys-3-06', 'sys-6-04'] | 2 | False | False | False | +| 1 | 15 | ['sys-4-01', 'sys-8-03'] | 2 | False | False | False | +| 1 | 3 | ['sys-8-01', 'sys-9-05'] | 2 | True | False | False | +| 1 | 8 | ['sys-6-03', 'sys-9-02'] | 2 | False | False | False | +| 1 | 7 | ['sys-2-06', 'sys-6-01'] | 2 | False | False | False | +| 1 | 10 | ['sys-6-06', 'sys-8-06'] | 2 | False | False | False | +| 1 | 11 | ['sys-3-02', 'sys-7-04'] | 2 | False | False | False | +| 1 | 17 | ['sys-8-04', 'sys-8-05'] | 2 | False | False | False | +| 1 | 18 | ['sys-4-02', 'sys-9-01'] | 2 | False | False | False | +| 1 | 16 | ['sys-2-02', 'sys-9-06'] | 2 | False | False | False | + +Logs show that we have 1 Infected Nodes and 1 Missing Node. Missing node represents a node that hasn't been tested yet and there are standard checks to see why it hasn't +been tested, such as having missing cards, it is occupied by another session, or it is a MISC use case. 
+ +``` log +[2023-02-07 09:02:39] INFO Infected 1 Node: ['sys-9-05'] +[2023-02-07 09:02:39] INFO Missing 1 Node: ['sys-2-03'] +[2023-02-07 09:02:39] INFO Healthy 34 Node: ["sys-2-01","sys-2-02","sys-2-03","sys-2-04","sys-2-06","sys-3-01","sys-3-02","sys-3-03","sys-3-04","sys-3-06","sys-4-01","sys-4-02","sys-4-03","sys-4-04","sys-4-05","sys-6-01","sys-6-02","sys-6-03","sys-6-04","sys-6-05","sys-6-06","sys-7-01","sys-7-02","sys-7-04","sys-7-06","sys-8-01","sys-8-03","sys-8-04","sys-8-05","sys-8-06","sys-9-01","sys-9-02","sys-9-03","sys-9-04","sys-9-06"] + +[2023-02-07 09:02:39] INFO Detected 36 Node: ["sys-2-01","sys-2-02","sys-2-03","sys-2-04","sys-2-06","sys-3-01","sys-3-02","sys-3-03","sys-3-04","sys-3-06","sys-4-01","sys-4-02","sys-4-03","sys-4-04","sys-4-05","sys-6-01","sys-6-02","sys-6-03","sys-6-04","sys-6-05","sys-6-06","sys-7-01","sys-7-02","sys-7-04","sys-7-06","sys-8-01","sys-8-03","sys-8-04","sys-8-05","sys-8-06","sys-9-01","sys-9-02","sys-9-03","sys-9-04","sys-9-05","sys-9-06"] +[2023-02-07 09:02:39] INFO 1 Nodes w/ missing cards: ['sys-2-03'] +``` + +## Setup + +HHS is compatible with python3 default packages and does not require additional packages +to be installed + +If your setup envionrment requires custom configruation, update the yaml files located in the templates folder. The default template +relies on storing HHS in a shared file system. + +If running on bare metal system, then install `pdsh` to your system. + +Update [config.yaml](config.yaml) to match your system envionrment + +``` yaml +# Sets HHS to screen for K8s or Bare Metal Envionrment (k8s, bare-metal). +system-info: + type: "k8s" + # Namespace is only required for k8s settings + namespace: "habanalabs" + # Can specify specific systems. 
For k8s, to scan entire cluster comment out hostfile + hostfile: "./hostfile" + + # Bare Metal Configurations + ssh-path: "./ssh" + tcp-interface: "10.3.124.0/24" + +# Image to run Habana Health Screen +image: "vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest" + +# Node Label used to identify a Gaudi Node +gaudi-node-label: "brightcomputing.com/node-category=gaudi" + +# Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) +log-level: "DEBUG" + +# Level 1 - Checks Individual Node Health (Ports status, Device Acquire failure) +level-1: + run: true + timeout_s: 300 + # Number of times to check Port Status + num-checks-link-state: 10 + +# Level 2 - Checks All Reduce between node pairs in the cluster. +level-2: + run: true + timeout_s: 180 + # Number of times to check Network connections between nodes + num-rounds: 5 +``` + +To learn the features of HHS, run the below command: + +``` bash +python screen.py --help + +usage: screen.py [-h] [--initialize] [--screen] [--target-nodes TARGET_NODES] + [--job-id JOB_ID] [--round ROUND] [--config CONFIG] + [--hhs-check [{node,hccl-demo,none}]] [--node-write-report] + [--node-name NODE_NAME] [--logs-dir LOGS_DIR] + +optional arguments: + -h, --help show this help message and exit + --initialize Downloads Necessary Repos and Creates Report Template + --screen Starts Health Screen for Cluster + --target-nodes TARGET_NODES + List of target nodes + --job-id JOB_ID Needed to identify hccl-demo running log + --round ROUND Needed to identify hccl-demo running round log + --config CONFIG Configuration file for Health Screener + --hhs-check [{node,hccl-demo,none}] + Check HHS Status for Node (Ports status, Device Acquire Fail) or all_reduce + (HCCL_DEMO between paris of nodes) + --node-write-report Write Individual Node Health Report + --node-name NODE_NAME Name of Node + --logs-dir LOGS_DIR Output directory of health screen results +``` + +To Run HHS, run the below command: + +``` 
bash +# Creates HHS Report and screens clusters for any infected nodes. +# Will check Level 1 and 2 by default +python screen.py --initialize --screen +``` + +### Run on BareMetal + +To run on bare-metal systems update the [config.yaml](config.yaml) to use bare-metal configuration. + +``` yaml +# Sets HHS to screen for K8s or Bare Metal Envionrment (k8s, bare-metal). +system-info: + type: "bare-metal" + # Namespace is only required for k8s settings + namespace: "habanalabs" + # Can specify specific systems. For k8s, to scan entire cluster comment out hostfile + hostfile: "./hostfile" + + # Bare Metal Configurations + ssh-path: "./ssh" + tcp-interface: "10.3.124.0/24" + +# Image to run Habana Health Screen +image: "vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest" + +# Node Label used to identify a Gaudi Node +gaudi-node-label: "brightcomputing.com/node-category=gaudi" + +# Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) +log-level: "DEBUG" + +# Level 1 - Checks Individual Node Health (Ports status, Device Acquire failure) +level-1: + run: true + timeout_s: 300 + # Number of times to check Port Status + num-checks-link-state: 10 + +# Level 2 - Checks All Reduce between node pairs in the cluster. 
level-2:
  run: true
  timeout_s: 180
  # Number of times to check Network connections between nodes
  num-rounds: 5
```

Before running the screening test, you need to generate the ssh key used for passwordless ssh:

``` bash
# Keys to setup initial bare-metal passwordless ssh connection between systems
ssh-keygen -t rsa -f ssh/hhs_rsa
chmod 600 ssh/hhs_rsa;
chmod 644 ssh/hhs_rsa.pub;

# Keys to setup containers passwordless ssh connection
ssh-keygen -t rsa -f template/bare-metal/ssh/id_rsa
chmod 600 template/bare-metal/ssh/id_rsa;
chmod 644 template/bare-metal/ssh/id_rsa.pub;

cat template/bare-metal/ssh/id_rsa.pub > template/bare-metal/ssh/authorized_keys
```

## Recovery Steps

| Issue | Description |
| ------------------------- | --------------------------------------------------------------------------------------- |
| Down Internal Links | Need to investigate Gaudi Card Health |
| Down External Links | Check Cable, switches, and Gaudi Card Health |
| QPC Issues | Network Configuration issue (stale gaudinet.json, stale NIC configurations, etc... ) |
| Missing Cards | Need to investigate Gaudi Card Health |
| k8s Issues | Node Resources are not set/configured properly |
diff --git a/utils/habana_health_screen/config.yaml b/utils/habana_health_screen/config.yaml
new file mode 100644
index 0000000..cc984e5
--- /dev/null
+++ b/utils/habana_health_screen/config.yaml
@@ -0,0 +1,35 @@
# Sets HHS to screen for K8s or Bare Metal Environment (k8s, bare-metal). k8s does not require any system info
system-info:
  type: "k8s"
  # Namespace is only required for k8s settings
  namespace: "habanalabs"

  # Can specify specific systems.
For k8s, to scan entire cluster comment out hostfile + # hostfile: "./hostfile" + + # Bare Metal Configurations + ssh-path: "./ssh" + tcp-interface: "10.3.124.0/24" + +# Image to run Habana Health Screen +image: "vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest" + +# Node Label used to identify a Gaudi Node +gaudi-node-label: "hhs_label=gaudi" + +# Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) +log-level: "DEBUG" + +# Level 1 - Checks Individual Node Health (Ports status, Device Busy, Device Acquire failure) +level-1: + run: true + timeout_s: 300 + # Number of times to check Port Status + num-checks-link-state: 12 + +# Level 2 - Checks All Reduce between node pairs in the cluster. +level-2: + run: true + timeout_s: 100 + # Number of times to check Network connections between nodes + num-rounds: 5 \ No newline at end of file diff --git a/utils/habana_health_screen/hccl_demo_helper.py b/utils/habana_health_screen/hccl_demo_helper.py new file mode 100644 index 0000000..3525ac9 --- /dev/null +++ b/utils/habana_health_screen/hccl_demo_helper.py @@ -0,0 +1,216 @@ +# Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import random, math, os, glob

import logging
_logger = logging.getLogger("habana_health_screener")

def find_groups(nodes_to_test, groups_tracker):
    """ Find a list of unique node groups to run the hccl_demo all-reduce test

    Args:
        nodes_to_test ([str]): Nodes list used to create a group of nodes for hccl_demo
        groups_tracker ([str]): History of used groups. A group has to be unique

    Returns:
        ([str],[str]): Unique list of groups of nodes, History of used groups
    """
    random.shuffle(nodes_to_test)

    found_unique = True
    num_nodes = len(nodes_to_test)
    node_groups = list()
    max_num_groups = num_nodes // 2

    # BUGFIX: guard BEFORE the factorial math below. The original computed
    # math.factorial(num_nodes - 2) first, which raises ValueError for fewer
    # than 2 nodes, and then returned a bare False that callers
    # (node_groups, tracker = find_groups(...)) could not unpack.
    if num_nodes < 2:
        _logger.warning("Need more than 1 Node to test all_reduce")
        return node_groups, groups_tracker

    # num_nodes choose 2: total number of distinct unordered node pairs
    max_combinations = (math.factorial(num_nodes)) / (math.factorial(num_nodes-2) * 2)
    _logger.debug(f"nodes_to_test {len(nodes_to_test)}: {nodes_to_test}")

    def add_unique_group_id(interval=2):
        """Try to build one unused group; on success record it and consume its nodes."""
        nonlocal node_groups, nodes_to_test
        i = 1
        max_attempts = 10
        found_unique = False

        if len(groups_tracker) >= max_combinations:
            _logger.info(f"Reached maximum combinations {max_combinations} for {num_nodes} Nodes")
            return found_unique

        node_group, group_id = find_group_id(nodes_to_test, i, interval=interval)

        while group_id in groups_tracker:
            if i > max_attempts:
                # BUGFIX: original lacked the f-prefix and referenced the
                # nonexistent name `max_attempt`, printing literal braces.
                _logger.warning(f"Max attempts {max_attempts} reached for finding unique pair combination.")
                return found_unique

            node_group, group_id = find_group_id(nodes_to_test, i, interval=interval)
            if group_id == "":
                return found_unique

            i += 1

        found_unique = True
        groups_tracker.append(group_id)
        node_groups.append(node_group)

        # Remove grouped nodes so they are not paired again this round
        for n in node_group:
            nodes_to_test.remove(n)

        return found_unique

    if num_nodes % 2 != 0:
        # Odd node count: start with one group of 3 so every node gets tested.
        found_unique = add_unique_group_id(interval=3)

    while len(node_groups) < max_num_groups and found_unique:
        found_unique = add_unique_group_id()

        if not found_unique:
            _logger.debug(f"Finished searching for Unique pair combinations")
            break

    return node_groups, groups_tracker

def find_group_id(nodes_to_test, start, interval=2):
    """ Finds a group of nodes and combines them to form a group id

    Args:
        nodes_to_test ([str]): Viable node list
        start (int): Index of next potential node id
        interval (int, optional): The size of the group id. Most common is pairs of nodes. Defaults to 2.

    Returns:
        ([str], str): Potential nodes (sorted) and their group id ("" when no group of 2+ could be formed)
    """
    group_id = ""

    if len(nodes_to_test) == 0:
        return [], group_id

    # First node plus (interval - 1) nodes taken from index `start` onward
    node_group = [nodes_to_test[0]]
    node_group.extend(nodes_to_test[start:start+interval-1])

    if len(node_group) > 1:
        node_group.sort()
        group_id = "".join(node_group)

    return node_group, group_id

def gather_hccl_logs(job_path, round, log_dir, health_report):
    """ Retrieve hccl_demo log files based on the job yamls executed

    Args:
        job_path (str): Base directory of job yamls executed
        round (int): Round to retrieve HCCL_Demo logs
        log_dir (str): Base directory of HCCL_Demo logs
        health_report (HabanaHealthReport): Tracks and reports health of hccl_demo
    """
    # PyYAML is only needed by this function; importing locally keeps the
    # rest of the module importable when PyYAML is not installed.
    import yaml

    path = f"{job_path}/**/r{round}/*.yaml"
    job_files = glob.glob(path, recursive=True)
    hccl_results = dict()

    for f_name in job_files:
        with open(f_name, 'r', newline='') as f:
            job_data = yaml.safe_load(f)

        launcher_template = job_data["spec"]["mpiReplicaSpecs"]["Launcher"]["template"]

        job_id = launcher_template["metadata"]["labels"]["name"]
        # NOTE(review): assumes env[4] of the launcher container is
        # TARGET_NODES — confirm against the k8s job template ordering.
        target_nodes = launcher_template["spec"]["containers"][0]["env"][4]["value"]
        target_nodes = target_nodes.split(',')

        hccl_results[f"{target_nodes}"] = hccl_demo_check(job_id=f"{log_dir}/L2/r{round}/{job_id}",
                                                          target_nodes=target_nodes, health_report=health_report, write=False)

    multi_node_fail = set()
    qpc_fail = set()
    missing_nodes = set()

    for results in hccl_results.values():
        if results["multi_node_fail"]:
            multi_node_fail.add(f"{results['node_ids']}")

        if results["qpc_fail"]:
            qpc_fail.add(f"{results['node_ids']}")

        if results["missing"]:
            missing_nodes.add(f"{results['node_ids']}")

    health_report.update_hccl_demo_health_report(round=round, all_node_pairs=hccl_results, multi_node_fail=multi_node_fail, qpc_fail=qpc_fail, missing_nodes=missing_nodes)

def hccl_demo_check(job_id, target_nodes, health_report, write=True):
    """ Check on HCCL Demo Status. Reads the output log; if it
    contains "Exiting HCCL demo with code: 1" the run is treated as a
    failure.

    Args:
        job_id (str): Metadata name of the Job (e.g. ".../hhs-hccl-r0-5"; round and group id are parsed from it)
        target_nodes ([str]): Nodes that are used in hccl_demo testing (sorted in place)
        health_report (HabanaHealthReport): Tracks and reports health of hccl_demo
        write (bool, optional): Writes to Report. Used to collect hccl results and update Base Health Report. Defaults to True

    Returns:
        dict: HCCL Demo Health Report result data.
    """
    f_name_log = f"{job_id}.log"
    err_phrase = "Exiting HCCL demo with code: 1"
    err_phrase_other = "During handling of the above exception, another exception occurred:"
    err_phrase_ssh = "ssh: Could not resolve hostname"
    err_phrase_qpc = "Source: QPC, error"
    pass_phrase = "Bandwidth"
    # job_id basename looks like "hhs-hccl-r<round>-<group>"
    round = os.path.basename(job_id).split("-")[2][1:]
    group_id = os.path.basename(job_id).split("-")[3]
    missing_logs = False
    hccl_demo_fail = True
    missing = False
    qpc_fail = False
    target_nodes.sort()

    if not os.path.exists(f_name_log):
        # No log produced at all: treat the pair as both failed and missing
        _logger.error(f"{f_name_log} can't be found")
        hccl_demo_fail = True
        missing = True
        missing_logs = True

    if not missing_logs:
        with open(f_name_log, "r", newline='') as f:
            lines = f.readlines()
            # Later lines win: a trailing "Bandwidth" line marks success even
            # if earlier error phrases appeared.
            for l in lines:
                if l.find(err_phrase_ssh) != -1:
                    hccl_demo_fail = True
                    missing = True
                elif l.find(err_phrase_qpc) != -1:
                    hccl_demo_fail = True
                    qpc_fail = True
                elif l.find(err_phrase) != -1 or l.find(err_phrase_other) != -1:
                    hccl_demo_fail = True
                elif l.find(pass_phrase) != -1:
                    hccl_demo_fail = False

    data = {
        "round": round,
        "group_id": group_id,
        "node_ids": target_nodes,
        "num_nodes": len(target_nodes),
        "multi_node_fail": hccl_demo_fail,
        "missing": missing,
        "qpc_fail": qpc_fail
    }

    if write:
        health_report.write_rows(data=[data], level=2)

    return data
an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +LOG_DIR=logs/$(date +'%m-%d-%Y/%m-%d-%Y_%H-%M') + +python screen.py --initialize --logs-dir $LOG_DIR; +python screen.py --screen --logs-dir $LOG_DIR; diff --git a/utils/habana_health_screen/screen.py b/utils/habana_health_screen/screen.py new file mode 100644 index 0000000..9ad8eb2 --- /dev/null +++ b/utils/habana_health_screen/screen.py @@ -0,0 +1,162 @@ +# Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import os, datetime, shutil, yaml, sys
import argparse
import logging

from utilities import download_repos, clear_hhs_pods, create_logger, get_logging_level
from hccl_demo_helper import hccl_demo_check
from system_utils import KubeUtils, BareMetalUtils

from HabanaHealthReport import HabanaHealthReport
from HNodes import HNodes, HNode


_logger = None


def main(args):
    """Drive the Habana Health Screen (HHS).

    Depending on the flags in *args*: initializes repos and report templates,
    runs Level 1 (per-node) and Level 2 (pairwise hccl_demo) screening across
    the cluster, or performs a single in-container node/hccl-demo check.

    Args:
        args (argparse.Namespace): parsed command-line options (see __main__ block).
    """
    global _logger

    # Default log directory: logs/MM-YYYY/MM-DD-YYYY/MM-DD-YYYY_HH-MM
    if args.logs_dir == "":
        c_time = datetime.datetime.now()
        date_year_format = c_time.strftime("%m-%Y")
        date_format = c_time.strftime("%m-%d-%Y")
        time_format = c_time.strftime("%H-%M")
        args.logs_dir = f"logs/{date_year_format}/{date_format}/{date_format}_{time_format}"


    hhs_report_name = "health_report.csv"
    hhs_log_dir = args.logs_dir

    # Running inside a node job: write a per-node report under the level sub-dir.
    # HHS_LEVEL is expected in the environment in this mode.
    if args.node_name:
        hhs_level = os.environ["HHS_LEVEL"]
        hhs_report_name = f"health_report_{args.node_name}.csv"
        hhs_log_dir = f"{args.logs_dir}/L{hhs_level}"

    health_report = HabanaHealthReport(f_dir=hhs_log_dir, report_name=hhs_report_name)
    job_path = "tmp/jobs"

    with open(args.config, 'r') as f:
        config_data = yaml.safe_load(f)

    log_level = get_logging_level(config_data["log-level"])
    _logger, _ = create_logger(logger_name="habana_health_screener", logger_file_name="screener", f_path=args.logs_dir, level=log_level)

    system_type = config_data["system-info"]["type"]
    if system_type == "k8s":
        system_mode = KubeUtils(image=config_data["image"],
                                hostfile=config_data["system-info"]["hostfile"],
                                namespace=config_data["system-info"]["namespace"],
                                log_dir=args.logs_dir)
    elif system_type == "bare-metal":
        system_mode = BareMetalUtils(image=config_data["image"],
                                     hostfile=config_data["system-info"]["hostfile"],
                                     ssh_path=config_data["system-info"]["ssh-path"],
                                     tcp_interface=config_data["system-info"]["tcp-interface"],
                                     log_dir=args.logs_dir)
    else:
        # BUGFIX: the original interpolated the undefined name `system_mode`
        # here, raising NameError instead of reporting the bad configuration.
        _logger.error(f"system-info type: {system_type} in {args.config} is not set correctly. It has to be set to k8s or bare-metal")
        sys.exit(1)


    if args.initialize:
        _logger.info(f"Loaded Configuration File: {args.config}")
        _logger.info(f"{config_data}")

        # Create report templates and download required repos (e.g. hccl_demo)
        health_report.create(create_base=True, create_hccl_demo=True)
        download_repos()

        system_mode.initialize_system()

    if args.screen:
        start_time = datetime.datetime.now()

        habana_nodes = HNodes(health_report=health_report)
        habana_nodes.all_nodes = system_mode.collect_nodes(gaudi_node_label=config_data["gaudi-node-label"])

        if config_data["level-1"]["run"]:
            _logger.info("Running Level 1 Checks: Card Diagnostics")
            if not os.path.exists(f"{health_report.f_dir}/L1"):
                os.makedirs(f"{health_report.f_dir}/L1")

            system_mode.initialize_node_jobs(level=1,
                                             nodes=habana_nodes,
                                             job_base_path=job_path)
            healthy_nodes, infected_nodes, missing_nodes = system_mode.monitor_hhs_status(level=1,
                                                                                          nodes=habana_nodes,
                                                                                          timeout_s=config_data["level-1"]["timeout_s"])
            system_mode.diagnose_unhealthy_nodes(infected_nodes, missing_nodes)

            system_mode.clear_hhs_pods()

        if config_data["level-2"]["run"]:
            _logger.info("Running Level 2 Checks: Pair HCCL_DEMO All Reduce")
            if not os.path.exists(f"{health_report.f_dir}/L2"):
                os.makedirs(f"{health_report.f_dir}/L2")

            for i in range(config_data["level-2"]["num-rounds"]):
                system_mode.initialize_node_jobs(level=2,
                                                 nodes=habana_nodes,
                                                 job_base_path=job_path,
                                                 round=i)
                healthy_nodes, infected_nodes, missing_nodes = system_mode.monitor_hhs_status(level=2,
                                                                                              nodes=habana_nodes,
                                                                                              timeout_s=config_data["level-2"]["timeout_s"],
                                                                                              round=i)
                system_mode.diagnose_unhealthy_nodes(infected_nodes, missing_nodes)

                system_mode.clear_hhs_pods(job_type="mpijobs")

                # A clean round means communication works cluster-wide; stop early.
                if len(infected_nodes) == 0:
                    _logger.info(f"Round {i}/{config_data['level-2']['num-rounds']}: No Infected Nodes found. Exit screening early.")
                    break


        end_time = datetime.datetime.now()
        diff_time = (end_time - start_time)
        _logger.info(f"Total Run Time: {diff_time}")

    if args.hhs_check == "node":
        # In-container single-node check (ports status, device acquire)
        node = HNode(health_report=health_report,
                     num_checks_link_state=config_data["level-1"]["num-checks-link-state"],
                     log_level=log_level)
        node.scan_cards()
        node.health_check(write_report=args.node_write_report)
    elif args.hhs_check == "hccl-demo":
        health_report.create(create_base=False, create_hccl_demo=True)

        # target_nodes arrives as a stringified list, e.g. "['a','b']"
        target_nodes = args.target_nodes.strip("[']").replace("'", "").split(',')
        hccl_demo_check(job_id=f"{health_report.f_dir}/L2/{args.round}/{args.job_id}",
                        target_nodes=target_nodes, health_report=health_report)

if __name__=="__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--initialize", action="store_true", help="Downloads Necessary Repos and Creates Report Template")
    parser.add_argument("--screen", action="store_true", help="Starts Health Screen for Cluster")
    parser.add_argument("--target-nodes", type=str, default="", help="List of target nodes")
    parser.add_argument("--job-id", type=str, default="", help="Needed to identify hccl-demo running log")
    parser.add_argument("--round", type=str, default="", help="Needed to identify hccl-demo running round log")
    parser.add_argument("--config", type=str, default="config.yaml", help="Configuration file for Health Screener")
    parser.add_argument("--hhs-check", default="none", const="none", nargs="?", choices=["node", "hccl-demo", "none"],
                        help="Check HHS Status for Node (Ports status, Device Acquire Fail) or all_reduce (HCCL_DEMO between pairs of nodes)")

    parser.add_argument("--node-write-report", action="store_true", help="Write Individual Node Health Report")
    parser.add_argument("--node-name", type=str, default="", help="Name of Node")
    parser.add_argument("--logs-dir", type=str, default="", help="Output directory of health screen results")

    args = parser.parse_args()
    main(args)
# ---------------------------------------------------------------------------
# utils/habana_health_screen/system_utils.py
# ---------------------------------------------------------------------------
# Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time, os, shutil, yaml, glob
import logging

from utilities import run_cmd, copy_files
from hccl_demo_helper import find_groups, gather_hccl_logs


_logger = logging.getLogger("habana_health_screener")


class SystemUtils():
    """Base helper for launching and monitoring Habana Health Screen (HHS) jobs.

    Subclasses (KubeUtils, BareMetalUtils) implement the environment-specific
    job launch/cleanup; this base class holds shared paths and monitoring logic.
    """

    def __init__(self, image, log_dir, remote_path="/tmp/hhs"):
        # job_path: local directory where generated job yamls are written
        self.job_path = "tmp/jobs"
        # image: container image used to run the health screen workloads
        self.image = image
        # log_dir: base directory for screen logs/reports
        self.log_dir = log_dir
        # remote_path: working directory on remote hosts (bare-metal mode)
        self.remote_path = remote_path

    def clear_jobs(self):
        """Delete all previously generated job files/dirs under self.job_path."""
        if not os.path.exists(self.job_path):
            os.makedirs(self.job_path)

        _logger.info(f"Clearing out {self.job_path}")
        for f in os.listdir(self.job_path):
            full_path = os.path.join(self.job_path, f)
            if os.path.isdir(full_path):
                shutil.rmtree(full_path)
            else:
                os.remove(full_path)

    def extract_host(self, hostfile):
        """Read a hostfile (one host per line) and return the hosts as a list."""
        hosts = list()
        with open(hostfile, "r") as f:
            hosts = [l.strip() for l in f]

        return hosts

    def monitor_hhs_status(self, level, nodes, timeout_s=240, round=0, monitor=True):
        """Poll until the health screen at `level` finishes (or times out), then
        consolidate results and classify nodes.

        Args:
            level (int): 1 = per-node checks, 2 = pairwise hccl_demo all-reduce
            nodes: HNodes container (all_nodes / healthy_nodes / health_report ...)
            timeout_s (int): total wait budget; polled in 10 s intervals
            round (int): Level 2 round number
            monitor (bool): if False, skip polling and gather reports directly
                            from the hosts instead

        Returns:
            ([str], [str], [str]): sorted (healthy, infected, missing) node names
        """
        is_finished = False
        attempt = 0
        # One attempt per 10 s slice of the timeout (+1 for any remainder)
        max_attempts = (timeout_s // 10) + min(timeout_s % 10, 1)
        hccl_demo = (level == 2)

        # Prefer the healthy subset from a previous level when available
        if len(nodes.healthy_nodes) > 0:
            num_nodes = len(nodes.healthy_nodes)
        else:
            num_nodes = len(nodes.all_nodes)

        _logger.info(f"Checking HHS Level {level} Status")

        if monitor:
            for attempt in range(max_attempts):
                is_finished, num_found_nodes = nodes.health_report.check_screen_complete(num_nodes=num_nodes, hccl_demo=hccl_demo, round=round)

                if is_finished:
                    _logger.info(f"Found {num_found_nodes}/{num_nodes} Nodes during Health Screen")

                    # Gives time for cleanup between rounds
                    time.sleep(10)
                    break

                _logger.info(f"Attempt {attempt}/{max_attempts}: Found {num_found_nodes}/{num_nodes} Nodes - Will Check again in 10 seconds")
                time.sleep(10)

            if level == 2:
                # Pull the per-pair hccl_demo logs for this round into the report
                gather_hccl_logs(job_path=self.job_path,
                                 round=round,
                                 log_dir=self.log_dir,
                                 health_report=nodes.health_report)

        else:
            # Non-monitoring path: fetch reports straight from the hosts
            hosts = nodes.all_nodes
            if len(nodes.launcher_nodes) > 0:
                hosts = nodes.launcher_nodes

            nodes.health_report.gather_health_report(level, remote_path="/tmp/hhs", hosts=hosts)
            nodes.health_report.consolidate_health_report(level=level, report_dir=f"{self.log_dir}")

        if level == 1:
            detected_nodes, infected_nodes, missing_nodes = nodes.health_report.extract_node_info()
            # Nodes that never reported in are counted as missing
            missing_nodes.update(set(nodes.all_nodes).difference(detected_nodes))

            nodes.health_report.update_health_report(detected_nodes=detected_nodes, infected_nodes=infected_nodes, missing_nodes=missing_nodes)
        elif level == 2:
            detected_nodes, infected_nodes, missing_nodes = nodes.health_report.extract_hccl_demo_info()
            nodes.health_report.update_health_report(detected_nodes=detected_nodes, infected_nodes=infected_nodes, missing_nodes=missing_nodes)

            # Merge in Level 1 findings so the final classification covers both
            detected_nodes_l1, infected_nodes_l1, missing_nodes = nodes.health_report.extract_node_info()
            detected_nodes.update(detected_nodes_l1)
            infected_nodes.update(infected_nodes_l1)

        healthy_nodes = detected_nodes.difference(infected_nodes).difference(missing_nodes)

        healthy_nodes = sorted(list(healthy_nodes))
        missing_nodes = sorted(list(missing_nodes))
        infected_nodes = sorted(list(infected_nodes))
        detected_nodes = sorted(list(detected_nodes))

        if level == 1:
            # Remember the healthy subset so Level 2 only pairs good nodes
            nodes.healthy_nodes = healthy_nodes

        _logger.info(f"Infected {len(infected_nodes)} Node: {infected_nodes}")
        _logger.info(f"Missing {len(missing_nodes)} Node: {missing_nodes}")
        _logger.info(f"Healthy {len(healthy_nodes)} Node: {healthy_nodes}")
        _logger.info(f"Detected {len(detected_nodes)} Node: {detected_nodes}")

        return healthy_nodes, infected_nodes, missing_nodes


class KubeUtils(SystemUtils):
    """Kubernetes implementation: launches HHS as k8s jobs/mpijobs via kubectl."""

    def __init__(self, image, hostfile, namespace, log_dir):
        super().__init__(image, log_dir)
        # namespace: k8s namespace all HHS jobs/pods live in
        self.namespace = namespace
        # hostfile: optional explicit node list; falls back to label selection
        self.hostfile = hostfile

    def initialize_system(self):
        """Remove leftover HHS pods/jobs and stale generated yamls."""
        self.clear_hhs_pods()
        self.clear_hhs_pods(job_type="mpijobs")
        self.clear_jobs()

    def collect_nodes(self, gaudi_node_label):
        """Return the nodes to screen: the hostfile contents if provided,
        otherwise all cluster nodes carrying `gaudi_node_label`."""
        if self.hostfile:
            all_nodes = self.extract_host(self.hostfile)
        else:
            # NOTE(review): the output-format flags are folded into the label
            # string, so the final command is
            # `kubectl get nodes -l=<label> -o=custom-columns=... --no-headers`
            gaudi_label_id = f"{gaudi_node_label} -o=custom-columns='NAME:.metadata.name' --no-headers"

            cmd = f"kubectl get nodes -l={gaudi_label_id}"
            output = run_cmd(cmd)
            all_nodes = output.strip().split()

        _logger.info(f"Collected Nodes: {all_nodes}")

        return all_nodes

    def initialize_node_jobs(self, level,
                             nodes,
                             job_base_path="tmp/jobs",
                             round=0):
        """Generate the per-node (L1) or per-pair (L2) job yamls from the
        templates and `kubectl apply` them.

        Args:
            level (int): 1 = one job per node, 2 = one mpijob per node pair
            nodes: HNodes container (all_nodes / healthy_nodes / groups_tracker)
            job_base_path (str): base output directory for generated yamls
            round (int): Level 2 round index (used in names and paths)
        """
        update_val = {
            "metadata-name": "",
            "round": round,
            "container-image": self.image,
            "num-nodes": "",
            "target-nodes": ""
        }

        if level == 1:
            source_f = "template/k8s/pt-habana-health-screen-L1.yaml"
            update_val["num-nodes"] = len(nodes.all_nodes)
            update_val["target-nodes"] = nodes.all_nodes
            node_groups = nodes.all_nodes
            job_path = f"{job_base_path}/L1"
            yaml_type = "job"
        elif level == 2:
            source_f = "template/k8s/pt-habana-health-screen-L2_hccl-demo.yaml"
            yaml_type = "mpijob"

            # Pair up only nodes that passed Level 1 when that info exists
            if len(nodes.healthy_nodes) > 0:
                nodes_to_test = nodes.healthy_nodes
            else:
                nodes_to_test = nodes.all_nodes.copy()

            node_groups, nodes.groups_tracker = find_groups(nodes_to_test, nodes.groups_tracker)
            job_path = f"{job_base_path}/L2/r{round}"

        for i, node_group in enumerate(node_groups):
            if level == 1:
                # L1: node_group is a single node name
                update_val["metadata-name"] = f"hhs-{node_group}"
                update_val["target-nodes"] = [node_group]
                out_file = f"{node_group}.yaml"
            elif level == 2:
                # L2: node_group is a list of paired nodes
                update_val["metadata-name"] = f"hhs-hccl-r{round}-{i}"
                update_val["target-nodes"] = node_group
                update_val["num-nodes"] = len(node_group)
                out_file = f"{update_val['metadata-name']}.yaml"

            self.update_yaml_job(source_file=source_f,
                                 update_val=update_val,
                                 out_dir=job_path,
                                 out_file=out_file,
                                 yaml_type=yaml_type)

        _logger.info(f"Launching Level {level} Jobs at {job_path}")
        cmd = f"kubectl apply -f {job_path}"
        output = run_cmd(cmd)
        _logger.debug(f"Applying job output: {output}")


    def update_yaml_job(self, update_val={},
                        source_file="template/k8s/pt-habana-health-screen-L1.yaml",
                        out_dir="tmp/jobs",
                        out_file="default.yaml",
                        yaml_type="job"):
        """Load a job/mpijob template, patch it with `update_val` (name, image,
        target nodes, env), and write it to `out_dir/out_file`.

        Returns:
            str: path of the yaml file written.
        """
        # NOTE(review): mutable default for update_val is risky in general;
        # it is only read here and callers always pass it explicitly.
        with open(source_file, 'r') as f:
            template_data = yaml.safe_load(f)

        template_data["metadata"]["name"] = update_val["metadata-name"]
        template_data["metadata"]["namespace"] = self.namespace

        if yaml_type == "job":
            replicas_specs = template_data["spec"]["template"]["spec"]

            # Mount the current working dir into the pod and point the
            # container at the requested image / log dir
            replicas_specs["volumes"][0]["hostPath"]["path"] = os.getcwd()
            replicas_specs["containers"][0]["image"] = update_val["container-image"]
            replicas_specs["containers"][0]["name"] = update_val["metadata-name"]
            replicas_specs["containers"][0]["env"].append({"name": "LOG_DIR", "value": self.log_dir})

            worker_selector_expression = replicas_specs["affinity"]["nodeAffinity"]["requiredDuringSchedulingIgnoredDuringExecution"]["nodeSelectorTerms"]
        elif yaml_type == "mpijob":
            replicas_specs = template_data["spec"]["mpiReplicaSpecs"]
            replicas_specs["Launcher"]["template"]["metadata"]["labels"]["name"] = update_val["metadata-name"]

            launcher_data = replicas_specs["Launcher"]["template"]["spec"]
            launcher_data["volumes"][0]["hostPath"]["path"] = os.getcwd()
            launcher_data["containers"][0]["image"] = update_val["container-image"]
            # Env vars consumed by screen.py / hccl_demo_helper inside the pod
            launcher_data["containers"][0]["env"].append({"name": "TARGET_NODES", "value": ','.join(update_val['target-nodes'])})
            launcher_data["containers"][0]["env"].append({"name": "LOG_DIR", "value": self.log_dir})
            launcher_data["containers"][0]["env"].append({"name": "ROUND", "value": f"r{update_val['round']}"})
            launcher_data["containers"][0]["env"].append({"name": "NUM_NODES", "value": f"{update_val['num-nodes']}"})

            replicas_specs["Worker"]["replicas"] = update_val['num-nodes']
            worker_data = replicas_specs["Worker"]["template"]["spec"]
            worker_data["volumes"][0]["hostPath"]["path"] = os.getcwd()
            worker_data["containers"][0]["image"] = update_val["container-image"]

            worker_selector_expression = worker_data["affinity"]["nodeAffinity"]["requiredDuringSchedulingIgnoredDuringExecution"]["nodeSelectorTerms"]

        # Pin the workers to exactly the target nodes via node affinity
        worker_selector_expression[0]["matchExpressions"][0]["values"] = update_val["target-nodes"]

        out_f = f"{out_dir}/{out_file}"
        dir_name = os.path.dirname(out_f)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

        with open(out_f, 'w+') as f:
            yaml.dump(template_data, f)

        _logger.info(f"Created Yaml: {out_f}")

        return out_f

    def clear_hhs_pods(self, job_type="jobs"):
        """ Clear Pods with label=hhs,hhs-hccl

        Args:
            job_type (str, optional): Type of Job to delete. Options: [jobs, mpijobs]. Defaults to "jobs".
        """
        _logger.info(f"Checking for existing HHS Pods ({job_type})")

        # L1 jobs are labeled app=hhs; L2 mpijobs app=hhs-hccl
        metadata_app = "hhs" if (job_type == "jobs") else "hhs-hccl"

        cmd = f"kubectl get pods -n {self.namespace} -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers"
        output = run_cmd(cmd).strip()

        if len(output) > 0:
            _logger.info(f"Found existing HHS Pods ({job_type}). Will delete.")

            cmd = f"kubectl get {job_type} -n {self.namespace} -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers"
            output = run_cmd(cmd).strip()
            jobs = output.split()

            _logger.info(f"Deleting jobs {jobs}")
            for job in jobs:
                cmd = f"kubectl delete {job_type} -n {self.namespace} {job}"
                output = run_cmd(cmd)

            # Poll until the pods are actually gone (up to ~150 s)
            cmd = f"kubectl get pods -n {self.namespace} -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers"
            max_attempt = 15
            for attempts in range(max_attempt):
                output = run_cmd(cmd).strip()

                if len(output) == 0:
                    break

                _logger.info(f"Attempt {attempts}: Pods are still up. Will wait 10 seconds to check again")
                time.sleep(10)

    def diagnose_unhealthy_nodes(self, infected_nodes, missing_nodes):
        """Classify missing nodes by inspecting `kubectl describe nodes`:
        fewer than 8 allocatable Gaudi cards => missing cards; Gaudi cards
        already allocated => occupied by another workload; otherwise
        unaccounted. Results are only logged.

        Args:
            infected_nodes ([str]): nodes that failed checks (currently unused here)
            missing_nodes ([str]): nodes that never reported results
        """
        in_use_set = set()
        missing_cards_set = set()
        misc_set = set()

        for n in missing_nodes:
            cmd = f"kubectl describe nodes -n {self.namespace} {n}"
            output = run_cmd(cmd).strip()
            output_arr = output.split("\n")

            # Walk the describe output section by section; the first matching
            # habana.ai/gaudi line after each header decides the category.
            reach_allocatable = False
            reach_allocatable_resources = False
            for l in output_arr:
                if("Allocatable:" in l):
                    reach_allocatable = True
                if(reach_allocatable and "habana.ai/gaudi: " in l):
                    num_gaudis = int(l.split()[1])
                    if num_gaudis < 8:
                        missing_cards_set.add(n)
                        break

                if("Allocated resources:" in l):
                    reach_allocatable_resources = True
                if(reach_allocatable_resources and "habana.ai/gaudi" in l):
                    num_gaudis = int(l.split()[1])
                    if num_gaudis > 0:
                        in_use_set.add(n)
                        break

        in_use_list = sorted(list(in_use_set))
        missing_cards_list = sorted(list(missing_cards_set))
        misc_list = sorted(list(set(missing_nodes).difference(in_use_set).difference(missing_cards_set)))

        if(len(in_use_list)):
            _logger.info(f"{len(in_use_list)} Occupied Nodes: {in_use_list}")
        if(len(missing_cards_list)):
            _logger.info(f"{len(missing_cards_list)} Nodes w/ missing cards: {missing_cards_list}")
        if(len(misc_list)):
            _logger.info(f"{len(misc_list)} Unaccounted Nodes: {misc_list}")
class BareMetalUtils(SystemUtils):
    """Bare-metal flavor of SystemUtils: drives the health screen over pdsh +
    docker compose instead of kubectl, using an ssh hostfile."""

    def __init__(self,
                 image,
                 hostfile,
                 ssh_path,
                 tcp_interface,
                 log_dir,
                 docker_compose_f="template/pt-hhs-docker-compose-L1.yaml"):
        # Remote working directory on every host is fixed to /tmp/hhs.
        super().__init__(image, log_dir, remote_path="/tmp/hhs")

        self.hostfile = hostfile
        self.ssh_path = ssh_path
        self.tcp_interface = tcp_interface
        self.docker_compose_f = docker_compose_f
        self.docker_compose_alias = "docker compose"

        self.hosts = self.extract_host(self.hostfile)

        # pdsh fans commands out over ssh to every host in the hostfile.
        os.environ["PDSH_RCMD_TYPE"] = "ssh"

        self.pdsh_cmd = f"pdsh -w ^{self.hostfile}"
        self.docker_compose_cmd = f"{self.pdsh_cmd} {self.docker_compose_alias}"

        self.initialize_ssh()

    def initialize_ssh(self):
        """Start an ssh-agent and load the HHS private key into it."""
        _logger.debug("Activating ssh-agent")
        cmd = f"ssh-agent -s"
        output = run_cmd(cmd)

        _logger.debug("Adding hhs private key to ssh-agent")
        cmd = f"ssh-add {self.ssh_path}/hhs_rsa"
        output = run_cmd(cmd)


    def initialize_system(self):
        """Tear down leftovers from previous runs, distribute the HHS public
        key to every host, and rsync the project tree to each host."""
        self.clear_hhs_pods()
        self.clear_hhs_pods(job_type="mpijobs")
        self.clear_jobs()
        self.clear_remote_jobs()

        _logger.info(f"Setting up ssh connection for hosts: {self.hosts}")
        for h in self.hosts:
            cmd = f"ssh-copy-id -o StrictHostKeyChecking=no -i {self.ssh_path}/hhs_rsa.pub {os.environ['USER']}@{h}"
            output = run_cmd(cmd)

        self.initialize_ssh()
        copy_files(src="../", dst=f"{self.remote_path}", exclude={"logs", "ssh", "tmp"}, hosts=self.hosts)


    def collect_nodes(self, gaudi_node_label=""):
        """Return the host list from the hostfile (gaudi_node_label is unused
        on bare metal; kept for interface parity with the k8s implementation)."""
        _logger.info(f"Collected Nodes: {self.hosts}")

        return self.hosts

    def initialize_node_jobs(self, level,
                             nodes,
                             job_base_path="tmp/jobs",
                             round=0):
        """Render per-node (L1) or per-group (L2) compose files, rsync them to
        the target hosts, and launch the containers via pdsh + docker compose.
        """
        update_val = {
            "metadata-name": "",
            "round": round,
            "container-image": self.image,
            "num-nodes": "",
            "target-nodes": "",
            "master-node": ""
        }

        if level == 1:
            update_val["num-nodes"] = len(nodes.all_nodes)
            update_val["target-nodes"] = nodes.all_nodes
            node_groups = nodes.all_nodes
            job_path = f"{job_base_path}/L1"
        elif level == 2:
            # Strip the "hhs-" prefix / ":48" slot suffix added elsewhere so
            # grouping operates on the bare hostnames.
            if len(nodes.healthy_nodes) > 0:
                nodes_to_test = [n.replace("hhs-","").replace(":48","") for n in nodes.healthy_nodes]
            else:
                nodes_to_test = nodes.all_nodes.copy()

            node_groups, nodes.groups_tracker = find_groups(nodes_to_test, nodes.groups_tracker)
            job_path = f"{job_base_path}/L2/r{round}"
            nodes.launcher_nodes = list()
            nodes.worker_nodes = list()

        # Rewrite config.yaml so its hostfile/ssh paths point at the remote tree.
        self.update_yaml_job(source_file="config.yaml", out_dir="tmp", out_file="config.yaml", yaml_type="config")
        for i, node_group in enumerate(node_groups):
            if level == 1:
                update_val["metadata-name"] = f"{node_group}"
                update_val["target-nodes"] = [node_group]

                self.update_yaml_job(update_val=update_val, out_dir=job_path)

                copy_files(src="tmp/jobs", dst=f"{self.remote_path}", hosts=update_val["target-nodes"])
                copy_files(src="template/bare-metal/dockerfile", dst=f"{self.remote_path}/jobs/L1", hosts=update_val["target-nodes"])
                copy_files(src="./ssh", dst=f"{self.remote_path}/jobs/L1", hosts=update_val["target-nodes"])
                copy_files(src="tmp/config.yaml", dst=f"{self.remote_path}/habana_health_screen", hosts=update_val["target-nodes"])

            elif level == 2:
                update_val["metadata-name"] = f"hhs-hccl-r{round}-{i}"
                update_val["target-nodes"] = node_group
                # First node of the group acts as the MPI launcher / master.
                update_val["master-node"] = node_group[0]
                update_val["num-nodes"] = len(node_group)

                self.update_yaml_job(source_file="template/bare-metal/pt-hhs-docker-compose-L2-launcher.yaml",
                                     update_val=update_val,
                                     out_dir=job_path,
                                     out_file=f"pt-hhs-docker-compose-L2-launcher.yaml",
                                     yaml_type="mpijob_launcher")

                self.update_yaml_job(source_file="template/bare-metal/pt-hhs-docker-compose-L2-worker.yaml",
                                     update_val=update_val,
                                     out_dir=job_path,
                                     out_file=f"pt-hhs-docker-compose-L2-worker.yaml",
                                     yaml_type="mpijob_worker")
                nodes.launcher_nodes.append(node_group[0])
                nodes.worker_nodes.extend(node_group[1:])

                copy_files(src="tmp/jobs", dst=f"{self.remote_path}", hosts=update_val["target-nodes"])
                copy_files(src="template/bare-metal/dockerfile", dst=f"{self.remote_path}/jobs/L2/r{round}", hosts=update_val["target-nodes"])
                copy_files(src="template/bare-metal/ssh", dst=f"{self.remote_path}/jobs/L2/r{round}", hosts=update_val["target-nodes"])
                copy_files(src="tmp/config.yaml", dst=f"{self.remote_path}/habana_health_screen", hosts=update_val["target-nodes"])


        _logger.info(f"Launching Level {level} Jobs at {job_path}")

        if level == 1:
            cmd = f"{self.docker_compose_cmd} -f {self.remote_path}/jobs/L1/pt-hhs-docker-compose-L1.yaml up"
            output = run_cmd(cmd).strip()
        elif level == 2:
            # Separate pdsh hostfiles: workers are brought up detached first,
            # then launchers run the actual hccl test in the foreground.
            with open(f"{job_base_path}/L2/r{round}/hostfile_launchers", mode='wt', encoding='utf-8') as f:
                f.write('\n'.join(nodes.launcher_nodes))
            with open(f"{job_base_path}/L2/r{round}/hostfile_workers", mode='wt', encoding='utf-8') as f:
                f.write('\n'.join(nodes.worker_nodes))

            cmd_list = [
                f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_workers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/pt-hhs-docker-compose-L2-worker.yaml build",
                f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_workers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/pt-hhs-docker-compose-L2-worker.yaml up -d --remove-orphans",
                f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_launchers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/pt-hhs-docker-compose-L2-launcher.yaml build",
                f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_launchers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/pt-hhs-docker-compose-L2-launcher.yaml up --remove-orphans"
            ]

            for cmd in cmd_list:
                output = run_cmd(cmd).strip()

    def update_yaml_job(self,
                        source_file="template/bare-metal/pt-hhs-docker-compose-L1.yaml",
                        out_dir="tmp/jobs",
                        out_file="pt-hhs-docker-compose-L1.yaml",
                        update_val={},
                        yaml_type="job"):
        """Render a docker-compose file (or rewrite config.yaml) from a template.

        yaml_type selects the edit: "job" (L1 compose), "mpijob_launcher" /
        "mpijob_worker" (L2 compose), or "config" (repoint config.yaml paths at
        the remote /tmp/hhs tree).

        NOTE(review): unlike the parent SystemUtils.update_yaml_job this does
        not return out_f — confirm no caller relies on the return value.
        """
        with open(source_file, 'r') as f:
            template_data = yaml.safe_load(f)

        if yaml_type == "job":
            template_data["services"]["hhs_level1"]["build"]["args"]["BASE_IMAGE"] = self.image

            template_data["services"]["hhs_level1"]["environment"].append(f"MY_NODE_NAME={update_val['metadata-name']}")
            template_data["services"]["hhs_level1"]["environment"].append(f"LOG_DIR={self.log_dir}")
        elif yaml_type == "mpijob_launcher":
            template_data["services"]["hhs_level2_launcher"]["build"]["args"]["BASE_IMAGE"] = self.image

            # Everything the launcher's run_hccl_demo.sh reads comes in as env vars.
            template_data["services"]["hhs_level2_launcher"]["environment"].append(f"MY_NODE_NAME={update_val['metadata-name']}")
            template_data["services"]["hhs_level2_launcher"]["environment"].append(f"LOG_DIR={self.log_dir}")
            template_data["services"]["hhs_level2_launcher"]["environment"].append(f"ROUND=r{update_val['round']}")
            template_data["services"]["hhs_level2_launcher"]["environment"].append(f"NUM_NODES={update_val['num-nodes']}")
            template_data["services"]["hhs_level2_launcher"]["environment"].append(f'TARGET_NODES={",".join(update_val["target-nodes"])}')
            template_data["services"]["hhs_level2_launcher"]["environment"].append(f"MASTER_ADDR={update_val['master-node']}")
            template_data["services"]["hhs_level2_launcher"]["environment"].append(f"TCP_INTERFACE={self.tcp_interface}")
            template_data["services"]["hhs_level2_launcher"]["environment"].append(f"JOB_ID={update_val['metadata-name']}")
        elif yaml_type == "mpijob_worker":
            template_data["services"]["hhs_level2_worker"]["build"]["args"]["BASE_IMAGE"] = self.image
            template_data["services"]["hhs_level2_worker"]["environment"].append(f"MY_NODE_NAME={update_val['metadata-name']}")
            template_data["services"]["hhs_level2_worker"]["environment"].append(f"LOG_DIR={self.log_dir}")
            template_data["services"]["hhs_level2_worker"]["environment"].append(f"JOB_ID={update_val['metadata-name']}")
        elif yaml_type == "config":
            # Repoint local paths at the fixed remote install location.
            hostfile = template_data["system-info"]["hostfile"]
            ssh_path = template_data["system-info"]["ssh-path"]
            template_data["system-info"]["hostfile"] = f"/tmp/hhs/habana_health_screen/{os.path.basename(hostfile)}"
            template_data["system-info"]["ssh-path"] = f"/tmp/hhs/habana_health_screen/{os.path.basename(ssh_path)}"

        out_f = f"{out_dir}/{out_file}"
        dir_name = os.path.dirname(out_f)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

        with open(out_f, 'w+') as f:
            yaml.dump(template_data, f)

        _logger.info(f"Created Yaml: {out_f}")

    def monitor_hhs_status(self, level, nodes, timeout_s=240, round=0, monitor=True):
        # Bare-metal launches block on docker compose, so active monitoring is
        # disabled regardless of the caller's `monitor` argument.
        return super().monitor_hhs_status(level=level, nodes=nodes, timeout_s=timeout_s, round=round, monitor=False)

    def clear_hhs_pods(self, job_type="jobs"):
        """Bring down HHS containers via `docker compose down` over pdsh.

        Args:
            job_type (str, optional): "jobs" tears down the L1 compose stack;
                any other value tears down every L2 launcher/worker compose
                file found under the remote jobs tree. Defaults to "jobs".
        """
        work_dir = f"{self.remote_path}/jobs"

        if job_type == "jobs":
            cmd = f"{self.docker_compose_cmd} -f {work_dir}/L1/pt-hhs-docker-compose-L1.yaml down"
            output = run_cmd(cmd).strip()
        else:
            files = glob.glob(f"{work_dir}/L2/**/*.yaml", recursive=True)
            _logger.debug(f"Files to clear: {files}")
            for f in files:
                dir_name = os.path.dirname(f)

                # NOTE(review): a yaml matching neither "launcher" nor "worker"
                # re-runs the previous iteration's cmd (or raises NameError on
                # the first iteration) — confirm only those two names occur.
                if "launcher" in f:
                    cmd = f"pdsh -w ^{dir_name}/hostfile_launchers {self.docker_compose_alias} -f {f} down"
                elif "worker" in f:
                    cmd = f"pdsh -w ^{dir_name}/hostfile_workers {self.docker_compose_alias} -f {f} down"

                output = run_cmd(cmd).strip()

    def clear_remote_jobs(self):
        """Delete the rendered job files from every host's /tmp/hhs/jobs/."""
        cmd = f"{self.pdsh_cmd} rm -R /tmp/hhs/jobs/"
        output = run_cmd(cmd)

    def diagnose_unhealthy_nodes(self, infected_nodes, missing_nodes):
        # Not implemented for bare metal; k8s-specific diagnosis lives in the
        # parent class.
        pass
# ---- template/bare-metal/dockerfile ----
# Extends the user-supplied base image with an sshd on port 3122 and a
# promiscuous ssh client config so MPI ranks can reach each other.
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

RUN mkdir ~/.ssh && \
    cd ~/.ssh && \
    sed -i 's/#Port 22/Port 3122/g' /etc/ssh/sshd_config && \
    sed -i 's/# Port 22/ Port 3122/g' /etc/ssh/ssh_config && \
    sed -i 's/3022/3122/g' ~/.bashrc && \
    echo "Host *" >> ~/.ssh/config && \
    echo "ForwardAgent yes" >> ~/.ssh/config && \
    echo "StrictHostKeyChecking no" >> ~/.ssh/config && \
    echo "UserKnownHostsFile /dev/null" >> ~/.ssh/config && \
    echo "LogLevel ERROR" >> ~/.ssh/config && \
    chmod 600 ~/.ssh/config

# ---- template/bare-metal/pt-hhs-docker-compose-L1.yaml ----
# L1 single-node screen: runs screen.py --hhs-check node inside the container.
version: '3.3'

services:
  hhs_level1:
    image: hhs_level1
    build:
      context: .
      network: host
      args:
        BASE_IMAGE: "${BASE_IMAGE}"
    container_name: hhs_level1
    runtime: habana
    environment:
      - HABANA_VISIBLE_DEVICES=all
      - OMPI_MCA_btl_vader_single_copy_mechanism=none
      - HHS_LEVEL=1
    cap_add:
      - SYS_NICE
    ipc: host
    network_mode: host
    working_dir: /tmp/hhs/habana_health_screen
    volumes:
      - ./ssh:/root/.ssh/
      - /tmp/hhs/habana_health_screen:/tmp/hhs/habana_health_screen
      - /etc/localtime:/etc/localtime:ro
    # $$ escapes compose interpolation so the vars expand inside the container.
    command: >
      bash -c "python screen.py --hhs-check node --logs-dir $${LOG_DIR} --node-name $${MY_NODE_NAME} --node-write-report && \
      chmod 777 -R $${LOG_DIR}"

# ---- template/bare-metal/pt-hhs-docker-compose-L2-launcher.yaml ----
# L2 launcher: runs the hccl_demo all_reduce across the worker containers.
version: '3.3'

services:
  hhs_level2_launcher:
    image: hhs_level2
    build:
      context: .
      network: host
      args:
        BASE_IMAGE: "${BASE_IMAGE}"
    container_name: hhs_level2_launcher
    runtime: habana
    environment:
      - HABANA_VISIBLE_DEVICES=all
      - OMPI_MCA_btl_vader_single_copy_mechanism=none
      - HHS_LEVEL=2
    cap_add:
      - SYS_NICE
    ipc: host
    network_mode: host
    working_dir: /tmp/hhs/habana_health_screen
    volumes:
      - ./ssh:/root/.ssh/
      - /tmp/hhs/habana_health_screen:/tmp/hhs/habana_health_screen
      - /etc/localtime:/etc/localtime:ro
    command: >
      template/bare-metal/run_hccl_demo.sh

# ---- template/bare-metal/pt-hhs-docker-compose-L2-worker.yaml ----
# L2 worker: idles (tty) waiting for the launcher's MPI ranks over ssh.
version: '3.3'

services:
  hhs_level2_worker:
    image: hhs_level2
    build:
      context: .
      network: host
      args:
        BASE_IMAGE: "${BASE_IMAGE}"
    container_name: hhs_level2_worker
    runtime: habana
    environment:
      - HABANA_VISIBLE_DEVICES=all
      - OMPI_MCA_btl_vader_single_copy_mechanism=none
      - HHS_LEVEL=2
    cap_add:
      - SYS_NICE
    ipc: host
    network_mode: host
    working_dir: /tmp/hhs/habana_health_screen
    volumes:
      - ./ssh:/root/.ssh/
      - /tmp/hhs/habana_health_screen:/tmp/hhs/habana_health_screen
      - /etc/localtime:/etc/localtime:ro
    tty: true

# ---- template/bare-metal/run_hccl_demo.sh ----
#!/bin/bash
# Runs hccl_demo all_reduce over the TARGET_NODES (48 ranks per host entry),
# tees the output into the round's log, then lets screen.py grade the result.

NUM_NODES="${NUM_NODES:-1}";
HOME_DIR="${HOME_DIR:-/tmp/hhs/habana_health_screen}";
WORK_DIR="${WORK_DIR:-/tmp/hhs/habana_health_screen/build/hccl_demo}";

NGPU_PER_NODE=8;
N_CARDS=$((NUM_NODES*NGPU_PER_NODE));

cd ${WORK_DIR};
CMD="python ${WORK_DIR}/run_hccl_demo.py \
--test all_reduce \
--loop 1000 \
--size 32m \
-mpi ";

mkdir -p $HOME_DIR/$LOG_DIR/L2/$ROUND/;
cat /dev/null > $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log;
touch $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log;
echo "Target Nodes: $TARGET_NODES" >> $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log;

$CMD \
-np ${N_CARDS} \
--allow-run-as-root \
--bind-to core \
--map-by ppr:4:socket:PE=6 \
--rank-by core --report-bindings \
--tag-output \
--merge-stderr-to-stdout --prefix $MPI_ROOT \
-H ${TARGET_NODES//,/:48,}:48 \
--mca btl_tcp_if_include $TCP_INTERFACE \
-x MASTER_ADDR \
-x PYTHONPATH="/usr/lib/habanalabs/:$PYTHONPATH" \
-x ENABLE_CONSOLE="true" -x LOG_LEVEL_ALL=4 \
2>&1 | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log;

cd ${HOME_DIR};
python $HOME_DIR/screen.py --hhs-check hccl-demo --logs-dir $LOG_DIR --job-id $JOB_ID --target-nodes $TARGET_NODES --node-name $MY_NODE_NAME;

chmod 777 -R $HOME_DIR/$LOG_DIR

# ---- template/k8s/pt-habana-health-screen-L1.yaml ----
# L1 k8s Job template; name/image/affinity values are placeholders rewritten
# by SystemUtils.update_yaml_job.
apiVersion: batch/v1
kind: Job
metadata:
  name: template-metadata-name
  namespace: default
  labels:
    app: hhs
spec:
  template:
    metadata:
      labels:
        app: hhs
    spec:
      restartPolicy: "Never"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                      - HHS-DUMMY-VAL
      volumes:
        - name: mydir
          hostPath:
            path: template-volume-mydir
            type: Directory
      tolerations:
        - key: ""
          operator: "Exists"
          effect: "NoSchedule"
      containers:
        - name: template-container-name
          image: template-container-image
          workingDir: /habana_health_screen
          command: ["/bin/bash", "-c"]
          args:
            - >-
              python $HOME_DIR/screen.py --hhs-check node --logs-dir $LOG_DIR --node-write-report;
          volumeMounts:
            - name: mydir
              mountPath: /habana_health_screen
          env:
            - name: HOME_DIR
              value: "/habana_health_screen"
            - name: HHS_LEVEL
              value: "1"
            - name: MY_POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
            - name: MY_NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: MY_POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
          resources:
            limits:
              habana.ai/gaudi: 8
              cpu: 95
            requests:
              habana.ai/gaudi: 8
              cpu: 95

# ---- template/k8s/pt-habana-health-screen-L2_hccl-demo.yaml ----
# L2 MPIJob template (kubeflow v2beta1): launcher runs hccl_demo, workers idle
# behind sshd; placeholders rewritten by SystemUtils.update_yaml_job.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: template-metadata-name
  namespace: default
  labels:
    app: hhs-hccl
spec:
  slotsPerWorker: 8
  runPolicy:
    cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        metadata:
          labels:
            app: hhs-hccl
        spec:
          volumes:
            - name: mydir
              hostPath:
                path: template-volume-mydir
                type: Directory
          containers:
            - image: template-container-image
              name: pt-hhs-launcher
              workingDir: /habana_health_screen
              volumeMounts:
                - name: mydir
                  mountPath: /habana_health_screen
              env:
                - name: JOB_ID
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.labels['name']
                - name: MY_NODE_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: spec.nodeName
                - name: HOME_DIR
                  value: "/habana_health_screen"
                - name: HHS_LEVEL
                  value: "2"
              command: ["/bin/bash", "-c"]
              args:
                - >-
                  declare -xr HOSTSFILE=$OMPI_MCA_orte_default_hostfile;

                  declare -xr NUM_NODES=$(wc -l < $HOSTSFILE);
                  declare -xr NGPU_PER_NODE=8;
                  declare -xr N_CARDS=$((NUM_NODES*NGPU_PER_NODE));

                  cd ${HOME_DIR}/build/hccl_demo;
                  declare -xr CMD="python ${HOME_DIR}/build/hccl_demo/run_hccl_demo.py \
                  --test all_reduce \
                  --loop 1000 \
                  --size 32m \
                  -mpi ";

                  set -eo pipefail;

                  mkdir -p $HOME_DIR/$LOG_DIR/L2/$ROUND/;
                  cat /dev/null > $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log;
                  touch $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log;
                  echo "Target Nodes: $TARGET_NODES" > $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log;

                  $CMD \
                  -np ${N_CARDS} \
                  --allow-run-as-root \
                  --bind-to core \
                  --map-by ppr:4:socket:PE=6 \
                  --rank-by core --report-bindings \
                  --tag-output \
                  --merge-stderr-to-stdout --prefix $MPI_ROOT \
                  -x PYTHONPATH="/usr/lib/habanalabs/:$PYTHONPATH" \
                  -x ENABLE_CONSOLE="true" -x LOG_LEVEL_ALL=4 \
                  -x MAX_TIMEOUT=60 2>&1 | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log;

                  cd ${HOME_DIR};
                  python ${HOME_DIR}/screen.py --hhs-check hccl-demo --target-nodes $TARGET_NODES --job-id $JOB_ID --logs-dir $LOG_DIR --round $ROUND;

    Worker:
      replicas: template-num-nodes
      template:
        metadata:
          labels:
            app: hhs-hccl
        spec:
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                  - matchExpressions:
                      - key: kubernetes.io/hostname
                        operator: In
                        values:
                          - HHS-DUMMY-VAL
          volumes:
            - name: mydir
              hostPath:
                path: template-volume-mydir
                type: Directory
          tolerations:
            - key: ""
              operator: "Exists"
              effect: "NoSchedule"
            - key: ""
              operator: "Exists"
              effect: "NoExecute"
          containers:
            - image: template-container-image
              name: pt-hhs-worker
              resources:
                limits:
                  habana.ai/gaudi: 8
                requests:
                  habana.ai/gaudi: 8
              volumeMounts:
                - name: mydir
                  mountPath: /habana_health_screen
              env:
                - name: HHS_LEVEL
                  value: "2"
                - name: MY_POD_IP
                  valueFrom:
                    fieldRef:
                      fieldPath: status.podIP
                - name: MY_NODE_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: spec.nodeName
                - name: MY_POD_NAMESPACE
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.namespace
              command: ["/bin/bash", "-c"]
              args:
                - >-
                  printenv | grep "MY" >> /etc/environment;
                  service ssh start;
                  sleep 365d;
# Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Shared helpers for the Habana Health Screen (HHS): logger setup, command
execution, dependency download, rsync-based file distribution, and k8s
job/pod cleanup."""

import os, time, sys
import subprocess, shlex
from datetime import datetime

import logging
from logging import handlers

_logger = logging.getLogger("habana_health_screener")

# Case-insensitive level-name -> logging constant mapping used by
# get_logging_level(); unknown names fall back to INFO.
_LOG_LEVELS = {
    "info": logging.INFO,
    "debug": logging.DEBUG,
    "warn": logging.WARN,
    "error": logging.ERROR,
    "critical": logging.CRITICAL,
}


def get_logging_level(log_level):
    """ Translate a level name ("info", "debug", ...) to a logging constant.

    Args:
        log_level (str): Level name, case-insensitive.

    Returns:
        int: Matching logging constant; logging.INFO for unknown names.
    """
    return _LOG_LEVELS.get(log_level.lower(), logging.INFO)

def create_logger(logger_name, logger_file_name, f_path="", level=logging.INFO, max_bytes=5e6, backup_count=10):
    """ Creates Logger that writes to logs directory

    Args:
        logger_name (str): Name of the Logger object.
        logger_file_name (str): Base name of the log file (".log" is appended).
        f_path (str, optional): Explicit directory for the log file. When empty,
            logs/{date}/{date}_{time}/ is used. Defaults to "".
        level (int, optional): Logging Level. Defaults to logging.INFO.
        max_bytes (int, optional): Max size of log file. Will rollover once maxed reach. Defaults to 5e6.
        backup_count (int, optional): Rollover Limit. Defaults to 10.

    Returns:
        (logging.Logger, str): Logger Object used to log details to the
        designated logger file, and the directory containing that file.
    """
    t_logger = logging.getLogger(logger_name)
    t_logger.setLevel(level)

    c_time = datetime.now()
    date_format = c_time.strftime("%m-%d-%Y")
    time_format = c_time.strftime("%H-%M")

    file_path = f"{f_path}/{logger_file_name}.log" if f_path != "" else f"logs/{date_format}/{date_format}_{time_format}/{logger_file_name}.log"
    d_path = os.path.dirname(file_path)

    if not os.path.exists(d_path):
        os.makedirs(d_path)

    formatter = logging.Formatter("[%(asctime)s] %(levelname)s %(message)s", datefmt='%Y-%m-%d %H:%M:%S')
    handler = logging.handlers.RotatingFileHandler(file_path, maxBytes=max_bytes, backupCount=backup_count)
    handler.setFormatter(formatter)

    # Mirror everything to stdout as well as the rotating file.
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setFormatter(formatter)

    t_logger.addHandler(handler)
    t_logger.addHandler(stream_handler)

    return t_logger, d_path

def run_cmd(cmd, timeout_s=1_800, verbose=False):
    """ Run Command through subprocess.run()

    Args:
        cmd (str): CMD to run; split with shlex, executed without a shell.
        timeout_s (int, optional): Timeout of CMD in seconds. Defaults to 1_800.
        verbose (bool, optional): Log the command and its output. Defaults to False.

    Returns:
        str: Combined stdout/stderr text of the command.

    Raises:
        subprocess.TimeoutExpired: If the command exceeds timeout_s.
    """
    args = shlex.split(cmd)

    if verbose:
        _logger.debug(f"Running cmd: {args}")

    # stderr is merged into stdout so callers get one combined transcript.
    result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=timeout_s)

    if verbose:
        _logger.info(result.stdout)

    return result.stdout

def download_repos():
    """ Download Habana's Setup_and_Install and HCCL_DEMO Repos to assist in health checks
    """
    if not os.path.exists("build"):
        os.makedirs("build")

    if not os.path.exists("build/Setup_and_Install"):
        _logger.info(f"Downloading Setup_and_Install into build/")
        cmd = "git clone https://github.com/HabanaAI/Setup_and_Install.git build/Setup_and_Install"
        run_cmd(cmd)

    if not os.path.exists("build/hccl_demo"):
        _logger.info(f"Downloading hccl_demo into build/")
        cmd = "git clone https://github.com/HabanaAI/hccl_demo.git build/hccl_demo"
        run_cmd(cmd)

    # hccl_demo's Makefile builds the MPI variant when MPI=1 is exported.
    os.environ["MPI"] = "1"
    cmd = "make -C build/hccl_demo"
    run_cmd(cmd)

def copy_files(src, dst, to_remote=True, hosts=[], exclude={}):
    """ Copies files through rsync from src to dst over the list of hosts

    Args:
        src (str): Source file/directory to copy
        dst (str): Destination to copy files/directory
        to_remote (bool, optional): rsync to remote destination (src -> host:dst). False will rsync to local destination (h:src -> dst). Defaults to True.
        hosts (list, optional): List of IP Addresses to copy to/from. Defaults to [].
        exclude (iterable, optional): File/directory patterns to skip; one rsync
            --exclude flag is emitted per pattern. Defaults to {}.
    """
    # Emit one --exclude per pattern. Interpolating the container itself
    # (f"--exclude={exclude}") handed rsync the Python set repr, which
    # shlex.split then shattered into garbage arguments.
    exclude_flags = " ".join(f"--exclude={pattern}" for pattern in sorted(exclude))
    rsync_cmd = f"rsync -ahzgop {exclude_flags}".rstrip()

    for h in hosts:
        if (to_remote):
            src_path = src
            dst_path = f"{h}:{dst}"
        else:
            src_path = f"{h}:{src}"
            dst_path = dst

        _logger.debug(f"Copying {src_path} to {dst_path}")
        cmd = f"{rsync_cmd} {src_path} {dst_path}"
        output = run_cmd(cmd)


def clear_job(job):
    """ Clear MPIJobs based on Job Name

    Args:
        job (str): Job Name to delete
    """
    _logger.info(f"Checking for existing MPIJobs {job}")
    cmd = f"kubectl get mpijobs -n default {job} -o=custom-columns='NAME:.metadata.name' --no-headers"
    output = run_cmd(cmd)

    if job in output:
        _logger.info(f"Found MPIJobs {job}. Will delete.")
        cmd = f"kubectl delete mpijobs -n default {job}"
        output = run_cmd(cmd)

        # Poll until the job's pods are actually gone (up to ~150 s).
        cmd = f"kubectl get pods -n default --selector=training.kubeflow.org/job-name={job} -o=custom-columns='NAME:.metadata.name' --no-headers"

        max_attempt = 15
        for attempts in range(max_attempt):
            output = run_cmd(cmd).strip()

            if (len(output) == 0):
                break

            _logger.info(f"Attempt {attempts} Pods are still up. Will wait 10 seconds to check again")
            time.sleep(10)


def clear_hhs_pods(job_type="jobs"):
    """ Clear Pods with label=hhs,hhs-hccl

    Module-level variant of SystemUtils.clear_hhs_pods with the namespace
    hardcoded to "default".

    Args:
        job_type (str, optional): Type of Job to delete. Options: [jobs, mpijobs]. Defaults to "jobs".
    """
    _logger.info(f"Checking for existing HHS Pods ({job_type})")

    # L1 pods carry app=hhs, L2 (hccl) pods carry app=hhs-hccl.
    metadata_app = "hhs" if (job_type == "jobs") else "hhs-hccl"

    cmd = f"kubectl get pods -n default -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers"
    output = run_cmd(cmd).strip()

    if len(output) > 0:
        _logger.info(f"Found existing HHS Pods ({job_type}). Will delete.")

        cmd = f"kubectl get {job_type} -n default -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers"
        output = run_cmd(cmd).strip()
        jobs = output.split()

        _logger.info(f"Deleting jobs {jobs}")
        for job in jobs:
            cmd = f"kubectl delete {job_type} -n default {job}"
            output = run_cmd(cmd)

        # Poll until the pods are actually gone (up to ~150 s).
        cmd = f"kubectl get pods -n default -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers"
        max_attempt = 15
        for attempts in range(max_attempt):
            output = run_cmd(cmd).strip()

            if (len(output) == 0):
                break

            _logger.info(f"Attempt {attempts}: Pods are still up. Will wait 10 seconds to check again")
            time.sleep(10)