From fe01c252d52225f7daaf070b79594d3097de77c2 Mon Sep 17 00:00:00 2001 From: Omri Almog Date: Tue, 18 Jun 2024 19:01:24 -0700 Subject: [PATCH] SynapseAi 1.16.1 release * Update dockerfiles with 1.16.1 content --- README.md | 6 +- dockerfiles/base/Dockerfile.rhel8.6 | 6 +- dockerfiles/common.mk | 4 +- utils/README.md | 60 ++++----- ...ramework_env.py => check_framework_env.py} | 14 +- utils/habana_health_screen/version.txt | 1 - .../.gitignore | 0 .../HealthReport.py} | 46 +++---- .../IGNodes.py} | 76 ++++++----- .../README.md | 73 +++++----- .../config.yaml | 16 +-- .../hccl_demo_helper.py | 6 +- .../hostfile | 0 .../run_hhs.sh | 0 .../screen.py | 52 ++++---- .../system_utils.py | 126 +++++++++--------- .../template/bare-metal/dockerfile | 0 .../intel-gaudi-docker-compose-L1.yaml} | 14 +- ...tel-gaudi-docker-compose-L2-launcher.yaml} | 12 +- ...intel-gaudi-docker-compose-L2-worker.yaml} | 12 +- .../template/bare-metal/run_hccl_demo.sh | 8 +- .../k8s/intel-gaudi-health-screen-L1.yaml} | 19 +-- ...tel-gaudi-health-screen-L2_hccl-demo.yaml} | 35 +++-- .../utilities.py | 19 +-- utils/intel_gaudi_health_screen/version.txt | 1 + 25 files changed, 309 insertions(+), 297 deletions(-) rename utils/{check_habana_framework_env.py => check_framework_env.py} (82%) mode change 100755 => 100644 delete mode 100644 utils/habana_health_screen/version.txt rename utils/{habana_health_screen => intel_gaudi_health_screen}/.gitignore (100%) rename utils/{habana_health_screen/HabanaHealthReport.py => intel_gaudi_health_screen/HealthReport.py} (90%) rename utils/{habana_health_screen/HNodes.py => intel_gaudi_health_screen/IGNodes.py} (77%) rename utils/{habana_health_screen => intel_gaudi_health_screen}/README.md (85%) rename utils/{habana_health_screen => intel_gaudi_health_screen}/config.yaml (69%) rename utils/{habana_health_screen => intel_gaudi_health_screen}/hccl_demo_helper.py (97%) rename utils/{habana_health_screen => intel_gaudi_health_screen}/hostfile (100%) rename utils/{habana_health_screen => intel_gaudi_health_screen}/run_hhs.sh (100%) rename utils/{habana_health_screen => intel_gaudi_health_screen}/screen.py (79%) rename utils/{habana_health_screen => intel_gaudi_health_screen}/system_utils.py (80%) rename utils/{habana_health_screen => intel_gaudi_health_screen}/template/bare-metal/dockerfile (100%) rename utils/{habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L1.yaml => intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L1.yaml} (56%) rename utils/{habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L2-launcher.yaml => intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-launcher.yaml} (65%) rename utils/{habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L2-worker.yaml => intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-worker.yaml} (63%) rename utils/{habana_health_screen => intel_gaudi_health_screen}/template/bare-metal/run_hccl_demo.sh (70%) rename utils/{habana_health_screen/template/k8s/pt-habana-health-screen-L1.yaml => intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml} (76%) rename utils/{habana_health_screen/template/k8s/pt-habana-health-screen-L2_hccl-demo.yaml => intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml} (80%) rename utils/{habana_health_screen => intel_gaudi_health_screen}/utilities.py (91%) create mode 100644 utils/intel_gaudi_health_screen/version.txt diff --git a/README.md b/README.md index f5edf93..65b6d62 
100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Gaudi Setup and Installation +# Intel® Gaudi® Accelerator Setup and Installation
@@ -6,7 +6,7 @@
-By installing, copying, accessing, or using the software, you agree to be legally bound by the terms and conditions of the Habana software license agreement [defined here](https://habana.ai/habana-outbound-software-license-agreement/). +By installing, copying, accessing, or using the software, you agree to be legally bound by the terms and conditions of the Intel Gaudi software license agreement [defined here](https://habana.ai/habana-outbound-software-license-agreement/).
@@ -18,7 +18,7 @@ By installing, copying, accessing, or using the software, you agree to be legall Welcome to Setup and Installation GitHub Repository! -The full installation documentation has been consolidated into the Installation Guide in our Habana Documentation. Please reference our [Habana docs](https://docs.habana.ai/en/latest/Installation_Guide/GAUDI_Installation_Guide.html) for the full installation guide. +The full installation documentation has been consolidated into the Installation Guide in our Intel Gaudi Documentation. Please reference our [Intel Gaudi docs](https://docs.habana.ai/en/latest/Installation_Guide/GAUDI_Installation_Guide.html) for the full installation guide. This respository contains the following references: - dockerfiles -- Reference dockerfiles and build script to build Gaudi Docker images diff --git a/dockerfiles/base/Dockerfile.rhel8.6 b/dockerfiles/base/Dockerfile.rhel8.6 index aa74d90..5d93727 100644 --- a/dockerfiles/base/Dockerfile.rhel8.6 +++ b/dockerfiles/base/Dockerfile.rhel8.6 @@ -18,13 +18,13 @@ RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.n RUN echo "[appstream]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ echo "name=CentOS Linux 8 - AppStream" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ - echo "mirrorlist=http://mirrorlist.centos.org/?release=\$releasever-stream&arch=\$basearch&repo=AppStream&infra=\$infra" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ + echo "baseurl=https://vault.centos.org/8-stream/AppStream/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ echo "gpgcheck=0" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo RUN echo "[BaseOS]" > /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ echo "name=CentOS Linux 8 - BaseOS" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ - echo "mirrorlist=http://mirrorlist.centos.org/?release=\$releasever-stream&arch=\$basearch&repo=BaseOS&infra=\$infra" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ + echo "baseurl=https://vault.centos.org/8-stream/BaseOS/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ echo "gpgcheck=0" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo RUN dnf install -y \ @@ -77,7 +77,7 @@ RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ RUN echo "[powertools]" > /etc/yum.repos.d/powertools.repo && \ echo "name=powertools" >> /etc/yum.repos.d/powertools.repo && \ - echo "baseurl=http://mirror.centos.org/centos/8-stream/PowerTools/x86_64/os/" >> /etc/yum.repos.d/powertools.repo && \ + echo "baseurl=https://vault.centos.org/8-stream/PowerTools/x86_64/os/" >> /etc/yum.repos.d/powertools.repo && \ echo "gpgcheck=0" >> /etc/yum.repos.d/powertools.repo RUN dnf install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".el8 \ diff --git a/dockerfiles/common.mk b/dockerfiles/common.mk index 84d8c23..81f3293 100644 --- a/dockerfiles/common.mk +++ b/dockerfiles/common.mk @@ -6,8 +6,8 @@ BUILD_DIR ?= $(CURDIR)/dockerbuild REPO_SERVER ?= vault.habana.ai PT_VERSION ?= 2.2.2 -RELEASE_VERSION ?= 1.16.0 -RELEASE_BUILD_ID ?= 526 +RELEASE_VERSION ?= 1.16.1 +RELEASE_BUILD_ID ?= 7 BASE_IMAGE_URL ?= base-installer-$(BUILD_OS) IMAGE_URL = $(IMAGE_NAME):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID) diff --git a/utils/README.md b/utils/README.md index 7c29b9d..af6a8c0 100644 --- a/utils/README.md +++ b/utils/README.md @@ -1,6 +1,6 @@ # Gaudi Utils -By installing, copying, accessing, or using the software, you agree to be legally bound by the terms and conditions of the Habana software license agreement [defined 
here](https://habana.ai/habana-outbound-software-license-agreement/). +By installing, copying, accessing, or using the software, you agree to be legally bound by the terms and conditions of the Intel Gaudi software license agreement [defined here](https://habana.ai/habana-outbound-software-license-agreement/). ## Table of Contents @@ -14,20 +14,20 @@ By installing, copying, accessing, or using the software, you agree to be legall - [Status](#status) - [Set IP](#set-ip) - [Unset IP](#unset-ip) - - [check\_habana\_framework\_env](#check_habana_framework_env) - - [Habana Health Screen (HHS)](#habana-health-screen-hhs) + - [check\_framework\_env](#check_framework_env) + - [Intel Gaudi Health Screen (IGHS)](#intel-gaudi-health-screen-ighs) ## Overview -Welcome to Gaudi's Util Scripts! +Welcome to Intel Gaudi's Util Scripts! -This folder contains some Gaudi utility scripts that users can access as reference. +This folder contains some Intel Gaudi utility scripts that users can access as reference. ## manage_network_ifs Moved to habanalabs-qual Example: (/opt/habanalabs/qual/gaudi2/bin/manage_network_ifs.sh). -This script can be used as reference to bring up, take down, set IPs, unset IPs and check for status of the Gaudi network interfaces. +This script can be used as reference to bring up, take down, set IPs, unset IPs and check for status of the Intel Gaudi network interfaces. The following is the usage of the script: @@ -35,11 +35,11 @@ The following is the usage of the script: usage: ./manage_network_ifs.sh [options] options: - --up toggle up all Habana network interfaces - --down toggle down all Habana network interfaces - --status print status of all Habana network interfaces - --set-ip set IP for all internal Habana network interfaces - --unset-ip unset IP from all internal Habana network interfaces + --up toggle up all Intel Gaudi network interfaces + --down toggle down all Intel Gaudi network interfaces + --status print status of all Intel Gaudi network interfaces + --set-ip set IP for all internal Intel Gaudi network interfaces + --unset-ip unset IP from all internal Intel Gaudi network interfaces -v, --verbose print more logs -h, --help print this help @@ -47,67 +47,67 @@ Note: Please run this script with one operation at a time ``` ## Operations -Before executing any operation, this script finds all the Habana network interfaces available on the system and stores the Habana interface information into a list. -The list will be used for the operations. If no Habana network interface is found, the script will exit. +Before executing any operation, this script finds all the Intel Gaudi network interfaces available on the system and stores the Intel Gaudi interface information into a list. +The list will be used for the operations. If no Intel Gaudi network interface is found, the script will exit. ### Up -Use the following command to bring all Habana network interfaces online: +Use the following command to bring all Intel Gaudi network interfaces online: ``` sudo manage_network_ifs.sh --up ``` -Once all the Habana interfaces are toggled up, IPs will be set by default. Please refer [Set Ip](#set-ip) for more detail. To unset IPs, run this script with '--unset-ip' +Once all the Intel Gaudi interfaces are toggled up, IPs will be set by default. Please refer [Set Ip](#set-ip) for more detail. 
To unset IPs, run this script with '--unset-ip' ### Down -Use the following command to bring all Habana network interfaces offline: +Use the following command to bring all Intel Gaudi network interfaces offline: ``` sudo manage_network_ifs.sh --down ``` ### Status -Print the current operational state of all Habana network interfaces such as how many ports are up/down: +Print the current operational state of all Intel Gaudi network interfaces such as how many ports are up/down: ``` sudo manage_network_ifs.sh --status ``` ### Set IP -Use the following command to assign a default IP for all Habana network interfaces: +Use the following command to assign a default IP for all Intel Gaudi network interfaces: ``` sudo manage_network_ifs.sh --set-ip ``` Note: Default IPs are 192.168.100.1, 192.168.100.2, 192.168.100.3 and so on ### Unset IP -Remove IP from all available Habana network interfaces by the following command: +Remove IP from all available Intel Gaudi network interfaces by the following command: ``` sudo manage_network_ifs.sh --unset-ip ``` -## check_habana_framework_env +## check_framework_env -This script can be used as reference to check the environment for running PyTorch on Habana. +This script can be used as reference to check the environment for running PyTorch on Intel Gaudi. The following is the usage of the script: ``` -usage: check_habana_framework_env.py [-h] [--cards CARDS] +usage: check_framework_env.py [-h] [--cards CARDS] -Check health of HPUs for PyTorch +Check health of Intel Gaudi for PyTorch optional arguments: -h, --help show this help message and exit --cards CARDS Set number of cards to test (default: 1) ``` -## Habana Health Screen (HHS) +## Intel Gaudi Health Screen (IGHS) -**Habana Health Screen** (HHS) tool has been developed to verify the cluster network health through a suite of diagnostic tests. The test +**Intel Gaudi Health Screen** (IGHS) tool has been developed to verify the cluster network health through a suite of diagnostic tests. The test includes checking gaudi port status, running small workloads, and running standard collective operations arcoss multiple systems. ``` bash usage: screen.py [-h] [--initialize] [--screen] [--target-nodes TARGET_NODES] [--job-id JOB_ID] [--round ROUND] [--config CONFIG] - [--hhs-check [{node,hccl-demo,none}]] [--node-write-report] + [--ighs-check [{node,hccl-demo,none}]] [--node-write-report] [--node-name NODE_NAME] [--logs-dir LOGS_DIR] optional arguments: @@ -119,18 +119,18 @@ optional arguments: --job-id JOB_ID Needed to identify hccl-demo running log --round ROUND Needed to identify hccl-demo running round log --config CONFIG Configuration file for Health Screener - --hhs-check [{node,hccl-demo,none}] - Check HHS Status for Node (Ports status, Device Acquire Fail) or all_reduce + --ighs-check [{node,hccl-demo,none}] + Check IGHS Status for Node (Ports status, Device Acquire Fail, Device Temperature) or all_reduce (HCCL_DEMO between paris of nodes) --node-write-report Write Individual Node Health Report --node-name NODE_NAME Name of Node --logs-dir LOGS_DIR Output directory of health screen results ``` -To run a full HHS test, run the below command: +To run a full IGHS test, run the below command: ``` bash -# Creates HHS Report and screens clusters for any infected nodes. +# Creates IGHS Report and screens clusters for any infected nodes. 
# Will check Level 1 and 2 by default python screen.py --initialize --screen ``` \ No newline at end of file diff --git a/utils/check_habana_framework_env.py b/utils/check_framework_env.py old mode 100755 new mode 100644 similarity index 82% rename from utils/check_habana_framework_env.py rename to utils/check_framework_env.py index 359aac0..c12bf28 --- a/utils/check_habana_framework_env.py +++ b/utils/check_framework_env.py @@ -15,7 +15,7 @@ import concurrent.futures def parse_arguments(): - parser = argparse.ArgumentParser(description="Check health of HPUs for PyTorch") + parser = argparse.ArgumentParser(description="Check health of Intel Gaudi for PyTorch") parser.add_argument("--cards", default=1, @@ -29,11 +29,11 @@ def parse_arguments(): return args def pytorch_test(device_id=0): - """ Checks health of HPU through running a basic - PyTorch example on HPU + """ Checks health of Intel Gaudi through running a basic + PyTorch example on Intel Gaudi Args: - device_id (int, optional): ID of HPU. Defaults to 0. + device_id (int, optional): ID of Intel Gaudi. Defaults to 0. """ os.environ["ID"] = str(device_id) @@ -42,7 +42,7 @@ def pytorch_test(device_id=0): import torch import habana_frameworks.torch.core except Exception as e: - print(f"Card {device_id} Failed to initialize Habana PyTorch: {str(e)}") + print(f"Card {device_id} Failed to initialize Intel Gaudi PyTorch: {str(e)}") raise try: @@ -50,7 +50,7 @@ def pytorch_test(device_id=0): y = x + x assert y == 4, 'Sanity check failed: Wrong Add output' - assert 'hpu' in y.device.type.lower(), 'Sanity check failed: Operation not executed on Habana Device' + assert 'hpu' in y.device.type.lower(), 'Sanity check failed: Operation not executed on Intel Gaudi Card' except (RuntimeError, AssertionError) as e: print(f"Card {device_id} Failure: {e}") raise @@ -64,7 +64,7 @@ def pytorch_test(device_id=0): for device_id, res in zip(range(args.cards), executor.map(pytorch_test, range(args.cards))): print(f"Card {device_id} PASSED") except Exception as e: - print(f"Failed to initialize Habana, error: {str(e)}") + print(f"Failed to initialize on Intel Gaudi, error: {str(e)}") print(f"Check FAILED") exit(1) diff --git a/utils/habana_health_screen/version.txt b/utils/habana_health_screen/version.txt deleted file mode 100644 index afaf360..0000000 --- a/utils/habana_health_screen/version.txt +++ /dev/null @@ -1 +0,0 @@ -1.0.0 \ No newline at end of file diff --git a/utils/habana_health_screen/.gitignore b/utils/intel_gaudi_health_screen/.gitignore similarity index 100% rename from utils/habana_health_screen/.gitignore rename to utils/intel_gaudi_health_screen/.gitignore diff --git a/utils/habana_health_screen/HabanaHealthReport.py b/utils/intel_gaudi_health_screen/HealthReport.py similarity index 90% rename from utils/habana_health_screen/HabanaHealthReport.py rename to utils/intel_gaudi_health_screen/HealthReport.py index 4ac194e..b0409f1 100644 --- a/utils/habana_health_screen/HabanaHealthReport.py +++ b/utils/intel_gaudi_health_screen/HealthReport.py @@ -18,12 +18,12 @@ import logging -_logger = logging.getLogger("habana_health_screener") +_logger = logging.getLogger("health_screener") -class HabanaHealthReport(): +class HealthReport(): def __init__(self, f_dir="tmp", report_name="health_report.csv"): - """ Initialize Habana Health Report Class + """ Initialize Health Report Class Args: f_dir (str, optional): File Directory to store Health Report logs and results. Defaults to "tmp". 
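As a point of reference for the renamed checker above, the command-line interface is unchanged apart from the filename, so a multi-card sanity check could be invoked as follows (an illustrative run; the card count of 8 is an assumption about the target system):

``` bash
# Run the per-card PyTorch sanity test on 8 Intel Gaudi cards
# (each card imports habana_frameworks.torch.core and executes a small add on 'hpu')
python utils/check_framework_env.py --cards 8
```

A non-zero exit code indicates that at least one card failed to initialize or to run the operation.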
@@ -83,8 +83,8 @@ def write_rows(self, cards=list(), node_id="", data=list(), level=1): """ Write health check results to Health Report CSV. Can write multiple rows at once Args: - cards ([HCard], optional): Level 1 HCards to report about. Defaults to list(). - node_id (str, optional): Node ID of HCards. Defaults to "". + cards ([IGCard], optional): Level 1 IGCards to report about. Defaults to list(). + node_id (str, optional): Node ID of IGCards. Defaults to "". data (_type_, optional): Health Report CSV Row data. Defaults to list(). level (int, optional): Health Screen Level. Defaults to 1. """ @@ -118,12 +118,12 @@ def update_health_report(self, detected_nodes, infected_nodes, missing_nodes): infected_nodes (list[str]): List of infected node_ids missing_nodes (list[str]): List of missing node_ids """ - tempfile = NamedTemporaryFile(mode='w', delete=False) + temp_file = NamedTemporaryFile(mode='w', delete=False) detected_nodes_cp = detected_nodes.copy() - with open(self.f_path, 'r', newline='') as csvfile, tempfile: - reader = csv.DictReader(csvfile) - writer = csv.DictWriter(tempfile, fieldnames=self.header) + with open(self.f_path, 'r', newline='') as csv_file, temp_file: + reader = csv.DictReader(csv_file) + writer = csv.DictWriter(temp_file, fieldnames=self.header) writer.writeheader() for row in reader: @@ -148,22 +148,22 @@ def update_health_report(self, detected_nodes, infected_nodes, missing_nodes): for n in missing_nodes: writer.writerow({"node_id": n, "multi_node_fail": True, "missing": True}) - shutil.move(tempfile.name, self.f_path) + shutil.move(temp_file.name, self.f_path) def update_hccl_demo_health_report(self, round, all_node_pairs, multi_node_fail, qpc_fail, missing_nodes): """ Update health_report with hccl_demo results, based on infected_nodes. Args: - all_node_pairs (list[str]): List of all node pairs reported by Level 2 round + all_node_pairs (list[str]): List of all Node Pairs reported by Level 2 round multi_node_fail (list[str]): List of Node Pairs that failed HCCL_Demo Test qpc_fail (list[str]): List of Node Pairs that failed HCCL_Demo Test due to QPC error missing_nodes (list[str]): List of Node Pairs that couldn't run HCCL_Demo """ - tempfile = NamedTemporaryFile(mode='w', delete=False) + temp_file = NamedTemporaryFile(mode='w', delete=False) - with open(self.f_path_hccl_demo, 'r', newline='') as csvfile, tempfile: - reader = csv.DictReader(csvfile) - writer = csv.DictWriter(tempfile, fieldnames=self.header_hccl_demo, extrasaction='ignore') + with open(self.f_path_hccl_demo, 'r', newline='') as csv_file, temp_file: + reader = csv.DictReader(csv_file) + writer = csv.DictWriter(temp_file, fieldnames=self.header_hccl_demo, extrasaction='ignore') writer.writeheader() for row in reader: @@ -181,7 +181,7 @@ def update_hccl_demo_health_report(self, round, all_node_pairs, multi_node_fail, if len(all_node_pairs): writer.writerows(list(all_node_pairs.values())) - shutil.move(tempfile.name, self.f_path_hccl_demo) + shutil.move(temp_file.name, self.f_path_hccl_demo) def check_screen_complete(self, num_nodes, hccl_demo=False, round=0): """ Check on status of Health Screen Check. 
@@ -306,11 +306,11 @@ def gather_health_report(self, level, remote_path, hosts): """ Gathers Health Report from all hosts Args: - level (str): HHS Level - remote_path (str): Remote Destintation of HHS Report - hosts (list, optional): List of IP Addresses to gather HHS Reports + level (str): IGHS Level + remote_path (str): Remote Destintation of IGHS Report + hosts (list, optional): List of IP Addresses to gather IGHS Reports """ - copy_files(src=f"{remote_path}/habana_health_screen/{self.f_dir}/L{level}", + copy_files(src=f"{remote_path}/intel_gaudi_health_screen/{self.f_dir}/L{level}", dst=f"{self.f_dir}", hosts=hosts, to_remote=False) @@ -319,7 +319,7 @@ def consolidate_health_report(self, level, report_dir): """ Consolidates the health_report_*.csv from worker pods into a single master csv file Args: - level (str): HHS Level + level (str): IGHS Level report_dir (str): Directory of CSV files to merge """ data = list() @@ -327,8 +327,8 @@ def consolidate_health_report(self, level, report_dir): csv_files = glob.glob(path) for f in csv_files: - with open(f, 'r', newline='') as csvfile: - reader = csv.DictReader(csvfile) + with open(f, 'r', newline='') as csv_file: + reader = csv.DictReader(csv_file) for row in reader: data.append(row) diff --git a/utils/habana_health_screen/HNodes.py b/utils/intel_gaudi_health_screen/IGNodes.py similarity index 77% rename from utils/habana_health_screen/HNodes.py rename to utils/intel_gaudi_health_screen/IGNodes.py index 4cf0abf..3980209 100644 --- a/utils/habana_health_screen/HNodes.py +++ b/utils/intel_gaudi_health_screen/IGNodes.py @@ -10,24 +10,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os, time, yaml, csv +import os, time, csv import logging -from multiprocessing.pool import Pool +import multiprocessing -from HabanaHealthReport import HabanaHealthReport +from HealthReport import HealthReport from utilities import run_cmd, create_logger -from hccl_demo_helper import find_groups -_logger = logging.getLogger("habana_health_screener") +_logger = logging.getLogger("health_screener") -class HNodes(): +class IGNodes(): - def __init__(self, health_report=HabanaHealthReport()): + def __init__(self, health_report=HealthReport()): """ Keeps Track of Nodes and their current states Args: - health_report (HabanaHealthReport, optional): HHS Health Report. Defaults to creating a new HabanaHealthReport(). + health_report (HealthReport, optional): IGHS Health Report. Defaults to creating a new HealthReport(). 
""" self.all_nodes = list() self.launcher_nodes = list() @@ -42,14 +41,13 @@ def __init__(self, health_report=HabanaHealthReport()): -class HNode(): +class IGNode(): - def __init__(self, name="", health_report=HabanaHealthReport(), num_checks_link_state=10, log_level=logging.INFO): + def __init__(self, name="", health_report=HealthReport(), num_checks_link_state=10, log_level=logging.INFO): self.name = name if name == "" and "MY_NODE_NAME" in os.environ: self.name = os.environ["MY_NODE_NAME"] - self.cards = dict() self.num_checks_link_state = num_checks_link_state @@ -77,21 +75,31 @@ def scan_cards(self): memory_used = int(row[3].split()[0]) temperature_C = int(row[4].split()[0]) - card = HCard(index=i, module_id=module_id, pci_address=pci_address, memory_used=memory_used, temperature=temperature_C, logger=self.logger) + card = IGCard(index=i, module_id=module_id, pci_address=pci_address, memory_used=memory_used, temperature=temperature_C, logger=self.logger) self.cards[i] = card self.cards = dict(sorted(self.cards.items())) def health_check(self, target_cards=[], write_report=False): - checked_cards = list() + checked_cards = list() + processes = list() + card_queue = multiprocessing.Queue() if len(target_cards) == 0: target_cards = self.cards.keys() for i in target_cards: card = self.cards[str(i)] - card.check_health(num_checks_link_state=self.num_checks_link_state) + p = multiprocessing.Process(target=card.check_health, args=(self.num_checks_link_state,card_queue)) + + p.start() + processes.append((card,p)) + + for card,p in processes: + p.join() + card_queue.put(None) + for card in iter(card_queue.get, None): checked_cards.append(card) self.logger.info(card) @@ -99,8 +107,7 @@ def health_check(self, target_cards=[], write_report=False): self.health_report.write_rows(node_id=self.name, cards=checked_cards) - -class HCard(): +class IGCard(): def __init__(self, index=-1, module_id=-1, pci_address="", memory_used=-1, framework="pytorch", temperature=-1, logger=None): self.logger = logger @@ -119,11 +126,13 @@ def __init__(self, index=-1, module_id=-1, pci_address="", memory_used=-1, frame self.external_ports = [1, 8, 9] self.incorrect_ports_direction = list() - def check_health(self,num_checks_link_state=10): + def check_health(self,num_checks_link_state=10, checked_cards=[]): self.check_link_state(attempts=num_checks_link_state, sleep_sec=0.2) self.check_device_acquire_fail() self.check_temperature_state() + checked_cards.put(self) + def check_link_state(self, attempts=10, sleep_sec=0.5): self.logger.debug(f"Checking {self.pci_address} Link State. 
Will check {attempts} times") cmd = f"hl-smi -n link -i {self.pci_address}" @@ -170,21 +179,26 @@ def check_port_direction(self): def check_device_acquire_fail(self): self.logger.debug(f"Checking {self.pci_address} for Device Acquire Issues") + self.device_acquire_fail = False - from build.Setup_and_Install.utils import check_habana_framework_env + os.environ["ID"] = str(self.module_id) - self.device_acquire_fail = False - fw_test = check_habana_framework_env.pytorch_test - if self.framework == "tensorflow": - fw_test = check_habana_framework_env.tensorflow_test + try: + import torch + import habana_frameworks.torch.core + except Exception as e: + self.logger.error(f"Card {self.module_id} {self.pci_address} Failed to initialize Intel Gaudi PyTorch: {str(e)}") + self.device_acquire_fail = True try: - with Pool() as pool: - result = pool.apply(fw_test, args=(self.module_id)) + x = torch.tensor([2]).to('hpu') + y = x + x + assert y == 4, 'Sanity check failed: Wrong Add output' + assert 'hpu' in y.device.type.lower(), 'Sanity check failed: Operation not executed on Habana Device' except (RuntimeError, AssertionError, Exception) as e: + self.logger.error(f"{self.pci_address} Device Acquire Failure: {e}") self.device_acquire_fail = True - self.logger.warning(f"{self.pci_address} Device Acquire Failure") return self.device_acquire_fail @@ -197,16 +211,8 @@ def check_temperature_state(self): self.temperature_state_C = "CRITICAL" elif self.temperature_C - base_temperature >= max_delta: self.temperature_state_C = "WARN" - - def check_temperature_state(self): - max_good_temperature = 83 - base_temperature = 25 - max_delta = 25 - - if self.temperature_C >= max_good_temperature: - self.temperature_state_C = "CRITICAL" - elif self.temperature_C - base_temperature >= max_delta: - self.temperature_state_C = "WARN" + else: + self.temperature_state_C = "NORMAL" def __str__(self): report_str = f""" Index: {self.index} diff --git a/utils/habana_health_screen/README.md b/utils/intel_gaudi_health_screen/README.md similarity index 85% rename from utils/habana_health_screen/README.md rename to utils/intel_gaudi_health_screen/README.md index ae23ced..a0f89d5 100644 --- a/utils/habana_health_screen/README.md +++ b/utils/intel_gaudi_health_screen/README.md @@ -1,20 +1,20 @@ -# Habana Health Screen 1.0.0 +# Intel Gaudi Health Screen 2.0.0 -A large scale Gaudi cluster contains a lot of moving parts. To ensure distributed training proceeds smoothly, it is recommended to check the +A large scale Intel Gaudi cluster contains a lot of moving parts. To ensure distributed training proceeds smoothly, it is recommended to check the cluster network health. Troubleshooting issues on a large cluster can be a tedious act. To simplify the debugging process the -**Habana Health Screen** (HHS) tool has been developed to verify the cluster network health through a suite of diagnostic tests. The test +**Intel Gaudi Health Screen** (IGHS) tool has been developed to verify the cluster network health through a suite of diagnostic tests. The test includes checking gaudi port status, running small workloads, and running standard collective operations arcoss multiple systems -HHS is capable of running on a Kubernetes cluster or on a baremetal cluster. It is an active scan, which will block other users from training -on a gaudi systems until the scans are complete. At the end of the scans, HHS produces a CSV report detailing the state of each gaudi card. +IGHS is capable of running on a Kubernetes cluster or on a baremetal cluster. 
It is an active scan, which will block other users from training +on a gaudi systems until the scans are complete. At the end of the scans, IGHS produces a CSV report detailing the state of each gaudi card. -It is reccomended to run HHS in the below scenarios: +It is reccomended to run IGHS in the below scenarios: * After a system upgrade/update * Before running a long term training * Pinpointing problematic systems in a cluster if a problem can't be isolated to a single system -HHS runs a multi-tiered configurable scan: +IGHS runs a multi-tiered configurable scan: * Level 1 - Individual System Diagnostics * Level 2 - Multi-System Communication Diagnostics @@ -27,10 +27,11 @@ Level 1 focuses on individual Gaudi Cards Health Diagnostics. | ------------------------- | ---------------------------------------------------------- | | Gaudi Ports Status | Checks if ports are DOWN | | Device Acquire Failures | Checks if devices are busy | +| Device Temperatue | Checks if devices temperatures are in acceptable range | **2 System Cluster Example** -Here is an example of running HHS on a 2 system cluster. It identifies the Gaudi Cards that have down links, device acquire issues, and +Here is an example of running IGHS on a 2 system cluster. It identifies the Gaudi Cards that have down links, device acquire issues, and flags for multi node communication failure | node_id | index | module_id | pci_address | temperature_C | temperature_C | device_acquire_fail | down_links | multi_node_fail | missing | @@ -73,7 +74,7 @@ first round. ** Multi Node Cluster Example** -Here is an example of running HHS for 2 rounds and the results gets recorded to `hccl_demo_health_report.csv`. It identifies node pairs that failed the all_reduce test. If "True" is flagged +Here is an example of running IGHS for 2 rounds and the results gets recorded to `hccl_demo_health_report.csv`. It identifies node pairs that failed the all_reduce test. If "True" is flagged in the multi_node_fail column, then one of the nodes has a communication issue. List of infected nodes will be printed out to the log as well as the `health_report.csv` multi_node_fail column. @@ -125,61 +126,61 @@ been tested, such as having missing cards, it is occupied by another session, or ## Setup -HHS is compatible with python3 default packages and does not require additional packages +IGHS is compatible with python3 default packages and does not require additional packages to be installed If your setup envionrment requires custom configruation, update the yaml files located in the templates folder. The default template -relies on storing HHS in a shared file system. +relies on storing IGHS in a shared file system. If running on bare metal system, then install `pdsh` to your system. Update [config.yaml](config.yaml) to match your system envionrment ``` yaml -# Sets HHS to screen for K8s or Bare Metal Envionrment (k8s, bare-metal). +# Sets IGHS to screen for K8s or Bare Metal Envionrment (k8s, bare-metal). system-info: type: "k8s" # Namespace is only required for k8s settings - namespace: "habanalabs" + namespace: "intelgaudi" # Can specify specific systems. 
For k8s, to scan entire cluster comment out hostfile - hostfile: "./hostfile" + # hostfile: "./hostfile" # Bare Metal Configurations ssh-path: "./ssh" tcp-interface: "10.3.124.0/24" -# Image to run Habana Health Screen +# Image to run Intel Gaudi Health Screen image: "vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest" -# Node Label used to identify a Gaudi Node -gaudi-node-label: "brightcomputing.com/node-category=gaudi" +# Node Label used to identify a Intel Gaudi Node +gaudi-node-label: "ighs_label=gaudi" # Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) log-level: "DEBUG" -# Level 1 - Checks Individual Node Health (Ports status, Device Acquire failure) +# Level 1 - Checks Individual Node Health (Ports status, Device Acquire failure, Device Temperature) level-1: run: true - timeout_s: 300 + timeout_s: 150 # Number of times to check Port Status num-checks-link-state: 10 # Level 2 - Checks All Reduce between node pairs in the cluster. level-2: run: true - timeout_s: 180 + timeout_s: 130 # Number of times to check Network connections between nodes num-rounds: 5 ``` -To learn the features of HHS, run the below command: +To learn the features of IGHS, run the below command: ``` bash python screen.py --help usage: screen.py [-h] [--initialize] [--screen] [--target-nodes TARGET_NODES] [--job-id JOB_ID] [--round ROUND] [--config CONFIG] - [--hhs-check [{node,hccl-demo,none}]] [--node-write-report] + [--ighs-check [{node,hccl-demo,none}]] [--node-write-report] [--node-name NODE_NAME] [--logs-dir LOGS_DIR] optional arguments: @@ -191,18 +192,18 @@ optional arguments: --job-id JOB_ID Needed to identify hccl-demo running log --round ROUND Needed to identify hccl-demo running round log --config CONFIG Configuration file for Health Screener - --hhs-check [{node,hccl-demo,none}] - Check HHS Status for Node (Ports status, Device Acquire Fail) or all_reduce + --ighs-check [{node,hccl-demo,none}] + Check IGHS Status for Node (Ports status, Device Acquire Fail, Device Temperature) or all_reduce (HCCL_DEMO between paris of nodes) --node-write-report Write Individual Node Health Report --node-name NODE_NAME Name of Node --logs-dir LOGS_DIR Output directory of health screen results ``` -To Run HHS, run the below command: +To Run IGHS, run the below command: ``` bash -# Creates HHS Report and screens clusters for any infected nodes. +# Creates IGHS Report and screens clusters for any infected nodes. # Will check Level 1 and 2 by default python screen.py --initialize --screen ``` @@ -212,11 +213,11 @@ python screen.py --initialize --screen To run on bare-metal systems update the [config.yaml](config.yaml) to use bare-metal configuration. ``` yaml -# Sets HHS to screen for K8s or Bare Metal Envionrment (k8s, bare-metal). +# Sets IGHS to screen for K8s or Bare Metal Envionrment (k8s, bare-metal). system-info: type: "bare-metal" # Namespace is only required for k8s settings - namespace: "habanalabs" + namespace: "intelgaudi" # Can specify specific systems. 
For k8s, to scan entire cluster comment out hostfile hostfile: "./hostfile" @@ -224,26 +225,26 @@ system-info: ssh-path: "./ssh" tcp-interface: "10.3.124.0/24" -# Image to run Habana Health Screen +# Image to run Intel Gaudi Health Screen image: "vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest" -# Node Label used to identify a Gaudi Node +# Node Label used to identify a Intel Gaudi Node gaudi-node-label: "brightcomputing.com/node-category=gaudi" # Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) log-level: "DEBUG" -# Level 1 - Checks Individual Node Health (Ports status, Device Acquire failure) +# Level 1 - Checks Individual Node Health (Ports status, Device Acquire failure, Device Temperature) level-1: run: true - timeout_s: 300 + timeout_s: 150 # Number of times to check Port Status num-checks-link-state: 10 # Level 2 - Checks All Reduce between node pairs in the cluster. level-2: run: true - timeout_s: 180 + timeout_s: 130 # Number of times to check Network connections between nodes num-rounds: 5 ``` @@ -252,9 +253,9 @@ Before running the screening test, you need to generate the ssh key used for pas ``` bash # Keys to setup initial bare-metal passwordless ssh connection between systems -ssh-keygen -t rsa -f ssh/hhs_rsa -chmod 600 ssh/hhs_rsa; -chmod 644 ssh/hhs_rsa.pub; +ssh-keygen -t rsa -f ssh/ighs_rsa +chmod 600 ssh/ighs_rsa; +chmod 644 ssh/ighs_rsa.pub; # Keys to setup containers passwordless ssh connection ssh-keygen -t rsa -f template/bare-metal/ssh/id_rsa diff --git a/utils/habana_health_screen/config.yaml b/utils/intel_gaudi_health_screen/config.yaml similarity index 69% rename from utils/habana_health_screen/config.yaml rename to utils/intel_gaudi_health_screen/config.yaml index cc984e5..34f8c88 100644 --- a/utils/habana_health_screen/config.yaml +++ b/utils/intel_gaudi_health_screen/config.yaml @@ -1,8 +1,8 @@ -# Sets HHS to screen for K8s or Bare Metal Envionrment (k8s, bare-metal). k8s does not require any system info +# Sets IGHS to screen for K8s or Bare Metal Envionrment (k8s, bare-metal). k8s does not require any system info system-info: type: "k8s" # Namespace is only required for k8s settings - namespace: "habanalabs" + namespace: "intelgaudi" # Can specify specific systems. For k8s, to scan entire cluster comment out hostfile # hostfile: "./hostfile" @@ -11,25 +11,25 @@ system-info: ssh-path: "./ssh" tcp-interface: "10.3.124.0/24" -# Image to run Habana Health Screen +# Image to run Intel Gaudi Health Screen image: "vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest" -# Node Label used to identify a Gaudi Node -gaudi-node-label: "hhs_label=gaudi" +# Node Label used to identify a Intel Gaudi Node +gaudi-node-label: "ighs_label=gaudi" # Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) log-level: "DEBUG" -# Level 1 - Checks Individual Node Health (Ports status, Device Busy, Device Acquire failure) +# Level 1 - Checks Individual Node Health (Ports status, Device Busy, Device Acquire failure, Device Temperature) level-1: run: true - timeout_s: 300 + timeout_s: 150 # Number of times to check Port Status num-checks-link-state: 12 # Level 2 - Checks All Reduce between node pairs in the cluster. 
level-2: run: true - timeout_s: 100 + timeout_s: 130 # Number of times to check Network connections between nodes num-rounds: 5 \ No newline at end of file diff --git a/utils/habana_health_screen/hccl_demo_helper.py b/utils/intel_gaudi_health_screen/hccl_demo_helper.py similarity index 97% rename from utils/habana_health_screen/hccl_demo_helper.py rename to utils/intel_gaudi_health_screen/hccl_demo_helper.py index 3525ac9..7868178 100644 --- a/utils/habana_health_screen/hccl_demo_helper.py +++ b/utils/intel_gaudi_health_screen/hccl_demo_helper.py @@ -13,7 +13,7 @@ import random, math, os, yaml, glob import logging -_logger = logging.getLogger("habana_health_screener") +_logger = logging.getLogger("health_screener") def find_groups(nodes_to_test, groups_tracker): """ Find a list of node groups to run hccl_demo all reduce test @@ -116,7 +116,7 @@ def gather_hccl_logs(job_path, round, log_dir, health_report): job_path (str): Base directory of job yamls executed round (int): Round to retrieve HCCL_Demo logs log_dir (str): Base directory of HCCL_Demo logs - health_report (HabanaHealthReport): Tracks and reports health of hccl_demo + health_report (HealthReport): Tracks and reports health of hccl_demo """ path = f"{job_path}/**/r{round}/*.yaml" job_files = glob.glob(path, recursive=True) @@ -159,7 +159,7 @@ def hccl_demo_check(job_id, target_nodes, health_report, write=True): Args: job_id (str): Metadata name of the Job target_nodes ([str]): Nodes that are used in hccl_demo testing - health_report (HabanaHealthReport): Tracks and reports health of hccl_demo + health_report (HealthReport): Tracks and reports health of hccl_demo write (bool, optional): Writes to Report. Used to collect hccl results and update Base Health Report. Default to True Returns: diff --git a/utils/habana_health_screen/hostfile b/utils/intel_gaudi_health_screen/hostfile similarity index 100% rename from utils/habana_health_screen/hostfile rename to utils/intel_gaudi_health_screen/hostfile diff --git a/utils/habana_health_screen/run_hhs.sh b/utils/intel_gaudi_health_screen/run_hhs.sh similarity index 100% rename from utils/habana_health_screen/run_hhs.sh rename to utils/intel_gaudi_health_screen/run_hhs.sh diff --git a/utils/habana_health_screen/screen.py b/utils/intel_gaudi_health_screen/screen.py similarity index 79% rename from utils/habana_health_screen/screen.py rename to utils/intel_gaudi_health_screen/screen.py index 9ad8eb2..9a0b3e2 100644 --- a/utils/habana_health_screen/screen.py +++ b/utils/intel_gaudi_health_screen/screen.py @@ -10,16 +10,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
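The `gaudi-node-label` setting above is what IGHS uses to discover which Kubernetes nodes to screen (`collect_nodes` selects nodes by this label), so the label has to be applied to the Gaudi nodes beforehand. A minimal sketch, assuming a standard kubectl setup and the default `ighs_label=gaudi` value from config.yaml; `<node-name>` is a placeholder:

``` bash
# Label an Intel Gaudi node so IGHS can discover it, then verify the selection
kubectl label nodes <node-name> ighs_label=gaudi
kubectl get nodes -l ighs_label=gaudi
```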
-import os, datetime, shutil, yaml, sys +import os, datetime, yaml, sys import argparse import logging -from utilities import download_repos, clear_hhs_pods, create_logger, get_logging_level +from utilities import download_repos, clear_ighs_pods, create_logger, get_logging_level from hccl_demo_helper import hccl_demo_check from system_utils import KubeUtils, BareMetalUtils -from HabanaHealthReport import HabanaHealthReport -from HNodes import HNodes, HNode +from HealthReport import HealthReport +from IGNodes import IGNodes, IGNode _logger = None @@ -36,22 +36,22 @@ def main(args): args.logs_dir = f"logs/{date_year_format}/{date_format}/{date_format}_{time_format}" - hhs_report_name = "health_report.csv" - hhs_log_dir = args.logs_dir + ighs_report_name = "health_report.csv" + ighs_log_dir = args.logs_dir if args.node_name: - hhs_level = os.environ["HHS_LEVEL"] - hhs_report_name = f"health_report_{args.node_name}.csv" - hhs_log_dir = f"{args.logs_dir}/L{hhs_level}" + ighs_level = os.environ["IGHS_LEVEL"] + ighs_report_name = f"health_report_{args.node_name}.csv" + ighs_log_dir = f"{args.logs_dir}/L{ighs_level}" - health_report = HabanaHealthReport(f_dir=hhs_log_dir, report_name=hhs_report_name) + health_report = HealthReport(f_dir=ighs_log_dir, report_name=ighs_report_name) job_path = "tmp/jobs" with open(args.config, 'r') as f: config_data = yaml.safe_load(f) log_level = get_logging_level(config_data["log-level"]) - _logger, _ = create_logger(logger_name="habana_health_screener", logger_file_name="screener", f_path=args.logs_dir, level=log_level) + _logger, _ = create_logger(logger_name="health_screener", logger_file_name="screener", f_path=args.logs_dir, level=log_level) if config_data["system-info"]["type"] == "k8s": system_mode = KubeUtils(image=config_data["image"], @@ -82,8 +82,8 @@ def main(args): if args.screen: start_time = datetime.datetime.now() - habana_nodes = HNodes(health_report=health_report) - habana_nodes.all_nodes = system_mode.collect_nodes(gaudi_node_label=config_data["gaudi-node-label"]) + intel_gaudi_nodes = IGNodes(health_report=health_report) + intel_gaudi_nodes.all_nodes = system_mode.collect_nodes(gaudi_node_label=config_data["gaudi-node-label"]) if config_data["level-1"]["run"]: _logger.info("Running Level 1 Checks: Card Diagnostics") @@ -91,14 +91,14 @@ def main(args): os.makedirs(f"{health_report.f_dir}/L1") system_mode.initialize_node_jobs(level=1, - nodes=habana_nodes, + nodes=intel_gaudi_nodes, job_base_path=job_path) - healthy_nodes, infected_nodes, missing_nodes = system_mode.monitor_hhs_status(level=1, - nodes=habana_nodes, + healthy_nodes, infected_nodes, missing_nodes = system_mode.monitor_ighs_status(level=1, + nodes=intel_gaudi_nodes, timeout_s=config_data["level-1"]["timeout_s"]) system_mode.diagnose_unhealthy_nodes(infected_nodes, missing_nodes) - system_mode.clear_hhs_pods() + system_mode.clear_ighs_pods() if config_data["level-2"]["run"]: _logger.info("Running Level 2 Checks: Pair HCCL_DEMO All Reduce") @@ -107,16 +107,16 @@ def main(args): for i in range(config_data["level-2"]["num-rounds"]): system_mode.initialize_node_jobs(level=2, - nodes=habana_nodes, + nodes=intel_gaudi_nodes, job_base_path=job_path, round=i) - healthy_nodes, infected_nodes, missing_nodes = system_mode.monitor_hhs_status(level=2, - nodes=habana_nodes, + healthy_nodes, infected_nodes, missing_nodes = system_mode.monitor_ighs_status(level=2, + nodes=intel_gaudi_nodes, timeout_s=config_data["level-2"]["timeout_s"], round=i) system_mode.diagnose_unhealthy_nodes(infected_nodes, 
missing_nodes) - system_mode.clear_hhs_pods(job_type="mpijobs") + system_mode.clear_ighs_pods(job_type="mpijobs") if len(infected_nodes) == 0: _logger.info(f"Round {i}/{config_data['level-2']['num-rounds']}: No Infected Nodes found. Exit screening early.") @@ -127,13 +127,13 @@ def main(args): diff_time = (end_time - start_time) _logger.info(f"Total Run Time: {diff_time}") - if args.hhs_check == "node": - node = HNode(health_report=health_report, + if args.ighs_check == "node": + node = IGNode(health_report=health_report, num_checks_link_state=config_data["level-1"]["num-checks-link-state"], log_level=log_level) node.scan_cards() node.health_check(write_report=args.node_write_report) - elif args.hhs_check == "hccl-demo": + elif args.ighs_check == "hccl-demo": health_report.create(create_base=False, create_hccl_demo=True) target_nodes = args.target_nodes.strip("[']").replace("'","").split(',') @@ -149,8 +149,8 @@ def main(args): parser.add_argument("--job-id", type=str, default="", help="Needed to identify hccl-demo running log") parser.add_argument("--round", type=str, default="", help="Needed to identify hccl-demo running round log") parser.add_argument("--config", type=str, default="config.yaml", help="Configuration file for Health Screener") - parser.add_argument("--hhs-check", default="none", const="none", nargs="?", choices=["node", "hccl-demo", "none"], - help="Check HHS Status for Node (Ports status, Device Acquire Fail) or all_reduce (HCCL_DEMO between paris of nodes)") + parser.add_argument("--ighs-check", default="none", const="none", nargs="?", choices=["node", "hccl-demo", "none"], + help="Check IGHS Status for Node (Ports status, Device Acquire Fail, Device Temperature) or all_reduce (HCCL_DEMO between paris of nodes)") parser.add_argument("--node-write-report", action="store_true", help="Write Individual Node Health Report") parser.add_argument("--node-name", type=str, default="", help="Name of Node") diff --git a/utils/habana_health_screen/system_utils.py b/utils/intel_gaudi_health_screen/system_utils.py similarity index 80% rename from utils/habana_health_screen/system_utils.py rename to utils/intel_gaudi_health_screen/system_utils.py index 2d530c3..76363d8 100644 --- a/utils/habana_health_screen/system_utils.py +++ b/utils/intel_gaudi_health_screen/system_utils.py @@ -17,12 +17,12 @@ from hccl_demo_helper import find_groups, gather_hccl_logs -_logger = logging.getLogger("habana_health_screener") +_logger = logging.getLogger("health_screener") class SystemUtils(): - def __init__(self, image, log_dir, remote_path="/tmp/hhs"): + def __init__(self, image, log_dir, remote_path="/tmp/ighs"): self.job_path = "tmp/jobs" self.image = image self.log_dir = log_dir @@ -47,7 +47,7 @@ def extract_host(self, hostfile): return hosts - def monitor_hhs_status(self, level, nodes, timeout_s=240, round=0, monitor=True): + def monitor_ighs_status(self, level, nodes, timeout_s=240, round=0, monitor=True): is_finished = False attempt = 0 max_attempts = (timeout_s // 10) + min(timeout_s % 10, 1) @@ -58,7 +58,7 @@ def monitor_hhs_status(self, level, nodes, timeout_s=240, round=0, monitor=True) else: num_nodes = len(nodes.all_nodes) - _logger.info(f"Checking HHS Level {level} Status") + _logger.info(f"Checking IGHS Level {level} Status") if monitor: for attempt in range(max_attempts): @@ -85,7 +85,7 @@ def monitor_hhs_status(self, level, nodes, timeout_s=240, round=0, monitor=True) if len(nodes.launcher_nodes) > 0: hosts = nodes.launcher_nodes - nodes.health_report.gather_health_report(level, 
remote_path="/tmp/hhs", hosts=hosts) + nodes.health_report.gather_health_report(level, remote_path="/tmp/ighs", hosts=hosts) nodes.health_report.consolidate_health_report(level=level, report_dir=f"{self.log_dir}") if level == 1: @@ -127,8 +127,8 @@ def __init__(self, image, hostfile, namespace, log_dir): self.hostfile = hostfile def initialize_system(self): - self.clear_hhs_pods() - self.clear_hhs_pods(job_type="mpijobs") + self.clear_ighs_pods() + self.clear_ighs_pods(job_type="mpijobs") self.clear_jobs() def collect_nodes(self, gaudi_node_label): @@ -158,14 +158,14 @@ def initialize_node_jobs(self, level, } if level == 1: - source_f = "template/k8s/pt-habana-health-screen-L1.yaml" + source_f = "template/k8s/intel-gaudi-health-screen-L1.yaml" update_val["num-nodes"] = len(nodes.all_nodes) update_val["target-nodes"] = nodes.all_nodes node_groups = nodes.all_nodes job_path = f"{job_base_path}/L1" yaml_type = "job" elif level == 2: - source_f = "template/k8s/pt-habana-health-screen-L2_hccl-demo.yaml" + source_f = "template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml" yaml_type = "mpijob" if len(nodes.healthy_nodes) > 0: @@ -178,11 +178,11 @@ def initialize_node_jobs(self, level, for i, node_group in enumerate(node_groups): if level == 1: - update_val["metadata-name"] = f"hhs-{node_group}" + update_val["metadata-name"] = f"ighs-{node_group}" update_val["target-nodes"] = [node_group] out_file = f"{node_group}.yaml" elif level == 2: - update_val["metadata-name"] = f"hhs-hccl-r{round}-{i}" + update_val["metadata-name"] = f"ighs-hccl-r{round}-{i}" update_val["target-nodes"] = node_group update_val["num-nodes"] = len(node_group) out_file = f"{update_val['metadata-name']}.yaml" @@ -200,7 +200,7 @@ def initialize_node_jobs(self, level, def update_yaml_job(self, update_val={}, - source_file="template/k8s/pt-habana-health-screen-L1.yaml", + source_file="template/k8s/intel-gaudi-health-screen-L1.yaml", out_dir="tmp/jobs", out_file="default.yaml", yaml_type="job"): @@ -252,21 +252,21 @@ def update_yaml_job(self, update_val={}, return out_f - def clear_hhs_pods(self, job_type="jobs"): - """ Clear Pods with label=hhs,hhs-hccl + def clear_ighs_pods(self, job_type="jobs"): + """ Clear Pods with label=ighs,ighs-hccl Args: job_type (str, optional): Type of Job to delete. Options: [jobs, mpijobs]. Defaults to "jobs". """ - _logger.info(f"Checking for existing HHS Pods ({job_type})") + _logger.info(f"Checking for existing IGHS Pods ({job_type})") - metadata_app = "hhs" if (job_type == "jobs") else "hhs-hccl" + metadata_app = "ighs" if (job_type == "jobs") else "ighs-hccl" cmd = f"kubectl get pods -n {self.namespace} -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers" output = run_cmd(cmd).strip() if len(output) > 0: - _logger.info(f"Found existing HHS Pods ({job_type}). Will delete.") + _logger.info(f"Found existing IGHS Pods ({job_type}). 
Will delete.") cmd = f"kubectl get {job_type} -n {self.namespace} -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers" output = run_cmd(cmd).strip() @@ -338,8 +338,8 @@ def __init__(self, ssh_path, tcp_interface, log_dir, - docker_compose_f="template/pt-hhs-docker-compose-L1.yaml"): - super().__init__(image, log_dir, remote_path="/tmp/hhs") + docker_compose_f="template/intel-gaudi-docker-compose-L1.yaml"): + super().__init__(image, log_dir, remote_path="/tmp/ighs") self.hostfile = hostfile self.ssh_path = ssh_path @@ -361,20 +361,20 @@ def initialize_ssh(self): cmd = f"ssh-agent -s" output = run_cmd(cmd) - _logger.debug("Adding hhs private key to ssh-agent") - cmd = f"ssh-add {self.ssh_path}/hhs_rsa" + _logger.debug("Adding ighs private key to ssh-agent") + cmd = f"ssh-add {self.ssh_path}/ighs_rsa" output = run_cmd(cmd) def initialize_system(self): - self.clear_hhs_pods() - self.clear_hhs_pods(job_type="mpijobs") + self.clear_ighs_pods() + self.clear_ighs_pods(job_type="mpijobs") self.clear_jobs() self.clear_remote_jobs() _logger.info(f"Setting up ssh connection for hosts: {self.hosts}") for h in self.hosts: - cmd = f"ssh-copy-id -o StrictHostKeyChecking=no -i {self.ssh_path}/hhs_rsa.pub {os.environ['USER']}@{h}" + cmd = f"ssh-copy-id -o StrictHostKeyChecking=no -i {self.ssh_path}/ighs_rsa.pub {os.environ['USER']}@{h}" output = run_cmd(cmd) self.initialize_ssh() @@ -406,7 +406,7 @@ def initialize_node_jobs(self, level, job_path = f"{job_base_path}/L1" elif level == 2: if len(nodes.healthy_nodes) > 0: - nodes_to_test = [n.replace("hhs-","").replace(":48","") for n in nodes.healthy_nodes] + nodes_to_test = [n.replace("ighs-","").replace(":48","") for n in nodes.healthy_nodes] else: nodes_to_test = nodes.all_nodes.copy() @@ -426,24 +426,24 @@ def initialize_node_jobs(self, level, copy_files(src="tmp/jobs", dst=f"{self.remote_path}", hosts=update_val["target-nodes"]) copy_files(src="template/bare-metal/dockerfile", dst=f"{self.remote_path}/jobs/L1", hosts=update_val["target-nodes"]) copy_files(src="./ssh", dst=f"{self.remote_path}/jobs/L1", hosts=update_val["target-nodes"]) - copy_files(src="tmp/config.yaml", dst=f"{self.remote_path}/habana_health_screen", hosts=update_val["target-nodes"]) + copy_files(src="tmp/config.yaml", dst=f"{self.remote_path}/intel_gaudi_health_screen", hosts=update_val["target-nodes"]) elif level == 2: - update_val["metadata-name"] = f"hhs-hccl-r{round}-{i}" + update_val["metadata-name"] = f"ighs-hccl-r{round}-{i}" update_val["target-nodes"] = node_group update_val["master-node"] = node_group[0] update_val["num-nodes"] = len(node_group) - self.update_yaml_job(source_file="template/bare-metal/pt-hhs-docker-compose-L2-launcher.yaml", + self.update_yaml_job(source_file="template/bare-metal/intel-gaudi-docker-compose-L2-launcher.yaml", update_val=update_val, out_dir=job_path, - out_file=f"pt-hhs-docker-compose-L2-launcher.yaml", + out_file=f"intel-gaudi-docker-compose-L2-launcher.yaml", yaml_type="mpijob_launcher") - self.update_yaml_job(source_file="template/bare-metal/pt-hhs-docker-compose-L2-worker.yaml", + self.update_yaml_job(source_file="template/bare-metal/intel-gaudi-docker-compose-L2-worker.yaml", update_val=update_val, out_dir=job_path, - out_file=f"pt-hhs-docker-compose-L2-worker.yaml", + out_file=f"intel-gaudi-docker-compose-L2-worker.yaml", yaml_type="mpijob_worker") nodes.launcher_nodes.append(node_group[0]) nodes.worker_nodes.extend(node_group[1:]) @@ -451,13 +451,13 @@ def initialize_node_jobs(self, level, copy_files(src="tmp/jobs", 
dst=f"{self.remote_path}", hosts=update_val["target-nodes"]) copy_files(src="template/bare-metal/dockerfile", dst=f"{self.remote_path}/jobs/L2/r{round}", hosts=update_val["target-nodes"]) copy_files(src="template/bare-metal/ssh", dst=f"{self.remote_path}/jobs/L2/r{round}", hosts=update_val["target-nodes"]) - copy_files(src="tmp/config.yaml", dst=f"{self.remote_path}/habana_health_screen", hosts=update_val["target-nodes"]) + copy_files(src="tmp/config.yaml", dst=f"{self.remote_path}/intel_gaudi_health_screen", hosts=update_val["target-nodes"]) _logger.info(f"Launching Level {level} Jobs at {job_path}") if level == 1: - cmd = f"{self.docker_compose_cmd} -f {self.remote_path}/jobs/L1/pt-hhs-docker-compose-L1.yaml up" + cmd = f"{self.docker_compose_cmd} -f {self.remote_path}/jobs/L1/intel-gaudi-docker-compose-L1.yaml up" output = run_cmd(cmd).strip() elif level == 2: with open(f"{job_base_path}/L2/r{round}/hostfile_launchers", mode='wt', encoding='utf-8') as f: @@ -466,50 +466,50 @@ def initialize_node_jobs(self, level, f.write('\n'.join(nodes.worker_nodes)) cmd_list = [ - f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_workers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/pt-hhs-docker-compose-L2-worker.yaml build", - f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_workers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/pt-hhs-docker-compose-L2-worker.yaml up -d --remove-orphans", - f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_launchers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/pt-hhs-docker-compose-L2-launcher.yaml build", - f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_launchers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/pt-hhs-docker-compose-L2-launcher.yaml up --remove-orphans" + f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_workers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/intel-gaudi-docker-compose-L2-worker.yaml build", + f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_workers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/intel-gaudi-docker-compose-L2-worker.yaml up -d --remove-orphans", + f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_launchers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/intel-gaudi-docker-compose-L2-launcher.yaml build", + f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_launchers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/intel-gaudi-docker-compose-L2-launcher.yaml up --remove-orphans" ] for cmd in cmd_list: output = run_cmd(cmd).strip() def update_yaml_job(self, - source_file="template/bare-metal/pt-hhs-docker-compose-L1.yaml", + source_file="template/bare-metal/intel-gaudi-docker-compose-L1.yaml", out_dir="tmp/jobs", - out_file="pt-hhs-docker-compose-L1.yaml", + out_file="intel-gaudi-docker-compose-L1.yaml", update_val={}, yaml_type="job"): with open(source_file, 'r') as f: template_data = yaml.safe_load(f) if yaml_type == "job": - template_data["services"]["hhs_level1"]["build"]["args"]["BASE_IMAGE"] = self.image + template_data["services"]["ighs_level1"]["build"]["args"]["BASE_IMAGE"] = self.image - template_data["services"]["hhs_level1"]["environment"].append(f"MY_NODE_NAME={update_val['metadata-name']}") - template_data["services"]["hhs_level1"]["environment"].append(f"LOG_DIR={self.log_dir}") + template_data["services"]["ighs_level1"]["environment"].append(f"MY_NODE_NAME={update_val['metadata-name']}") + 
template_data["services"]["ighs_level1"]["environment"].append(f"LOG_DIR={self.log_dir}") elif yaml_type == "mpijob_launcher": - template_data["services"]["hhs_level2_launcher"]["build"]["args"]["BASE_IMAGE"] = self.image - - template_data["services"]["hhs_level2_launcher"]["environment"].append(f"MY_NODE_NAME={update_val['metadata-name']}") - template_data["services"]["hhs_level2_launcher"]["environment"].append(f"LOG_DIR={self.log_dir}") - template_data["services"]["hhs_level2_launcher"]["environment"].append(f"ROUND=r{update_val['round']}") - template_data["services"]["hhs_level2_launcher"]["environment"].append(f"NUM_NODES={update_val['num-nodes']}") - template_data["services"]["hhs_level2_launcher"]["environment"].append(f'TARGET_NODES={",".join(update_val["target-nodes"])}') - template_data["services"]["hhs_level2_launcher"]["environment"].append(f"MASTER_ADDR={update_val['master-node']}") - template_data["services"]["hhs_level2_launcher"]["environment"].append(f"TCP_INTERFACE={self.tcp_interface}") - template_data["services"]["hhs_level2_launcher"]["environment"].append(f"JOB_ID={update_val['metadata-name']}") + template_data["services"]["ighs_level2_launcher"]["build"]["args"]["BASE_IMAGE"] = self.image + + template_data["services"]["ighs_level2_launcher"]["environment"].append(f"MY_NODE_NAME={update_val['metadata-name']}") + template_data["services"]["ighs_level2_launcher"]["environment"].append(f"LOG_DIR={self.log_dir}") + template_data["services"]["ighs_level2_launcher"]["environment"].append(f"ROUND=r{update_val['round']}") + template_data["services"]["ighs_level2_launcher"]["environment"].append(f"NUM_NODES={update_val['num-nodes']}") + template_data["services"]["ighs_level2_launcher"]["environment"].append(f'TARGET_NODES={",".join(update_val["target-nodes"])}') + template_data["services"]["ighs_level2_launcher"]["environment"].append(f"MASTER_ADDR={update_val['master-node']}") + template_data["services"]["ighs_level2_launcher"]["environment"].append(f"TCP_INTERFACE={self.tcp_interface}") + template_data["services"]["ighs_level2_launcher"]["environment"].append(f"JOB_ID={update_val['metadata-name']}") elif yaml_type == "mpijob_worker": - template_data["services"]["hhs_level2_worker"]["build"]["args"]["BASE_IMAGE"] = self.image - template_data["services"]["hhs_level2_worker"]["environment"].append(f"MY_NODE_NAME={update_val['metadata-name']}") - template_data["services"]["hhs_level2_worker"]["environment"].append(f"LOG_DIR={self.log_dir}") - template_data["services"]["hhs_level2_worker"]["environment"].append(f"JOB_ID={update_val['metadata-name']}") + template_data["services"]["ighs_level2_worker"]["build"]["args"]["BASE_IMAGE"] = self.image + template_data["services"]["ighs_level2_worker"]["environment"].append(f"MY_NODE_NAME={update_val['metadata-name']}") + template_data["services"]["ighs_level2_worker"]["environment"].append(f"LOG_DIR={self.log_dir}") + template_data["services"]["ighs_level2_worker"]["environment"].append(f"JOB_ID={update_val['metadata-name']}") elif yaml_type == "config": hostfile = template_data["system-info"]["hostfile"] ssh_path = template_data["system-info"]["ssh-path"] - template_data["system-info"]["hostfile"] = f"/tmp/hhs/habana_health_screen/{os.path.basename(hostfile)}" - template_data["system-info"]["ssh-path"] = f"/tmp/hhs/habana_health_screen/{os.path.basename(ssh_path)}" + template_data["system-info"]["hostfile"] = f"/tmp/ighs/intel_gaudi_health_screen/{os.path.basename(hostfile)}" + template_data["system-info"]["ssh-path"] = 
f"/tmp/ighs/intel_gaudi_health_screen/{os.path.basename(ssh_path)}" out_f = f"{out_dir}/{out_file}" dir_name = os.path.dirname(out_f) @@ -521,14 +521,14 @@ def update_yaml_job(self, _logger.info(f"Created Yaml: {out_f}") - def monitor_hhs_status(self, level, nodes, timeout_s=240, round=0, monitor=True): - return super().monitor_hhs_status(level=level, nodes=nodes, timeout_s=timeout_s, round=round, monitor=False) + def monitor_ighs_status(self, level, nodes, timeout_s=240, round=0, monitor=True): + return super().monitor_ighs_status(level=level, nodes=nodes, timeout_s=timeout_s, round=round, monitor=False) - def clear_hhs_pods(self, job_type="jobs"): + def clear_ighs_pods(self, job_type="jobs"): work_dir = f"{self.remote_path}/jobs" if job_type == "jobs": - cmd = f"{self.docker_compose_cmd} -f {work_dir}/L1/pt-hhs-docker-compose-L1.yaml down" + cmd = f"{self.docker_compose_cmd} -f {work_dir}/L1/intel-gaudi-docker-compose-L1.yaml down" output = run_cmd(cmd).strip() else: files = glob.glob(f"{work_dir}/L2/**/*.yaml", recursive=True) @@ -544,7 +544,7 @@ def clear_hhs_pods(self, job_type="jobs"): output = run_cmd(cmd).strip() def clear_remote_jobs(self): - cmd = f"{self.pdsh_cmd} rm -R /tmp/hhs/jobs/" + cmd = f"{self.pdsh_cmd} rm -R /tmp/ighs/jobs/" output = run_cmd(cmd) def diagnose_unhealthy_nodes(self, infected_nodes, missing_nodes): diff --git a/utils/habana_health_screen/template/bare-metal/dockerfile b/utils/intel_gaudi_health_screen/template/bare-metal/dockerfile similarity index 100% rename from utils/habana_health_screen/template/bare-metal/dockerfile rename to utils/intel_gaudi_health_screen/template/bare-metal/dockerfile diff --git a/utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L1.yaml b/utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L1.yaml similarity index 56% rename from utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L1.yaml rename to utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L1.yaml index c7b7f97..fbee7d9 100644 --- a/utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L1.yaml +++ b/utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L1.yaml @@ -1,28 +1,28 @@ version: '3.3' services: - hhs_level1: - image: hhs_level1 + ighs_level1: + image: ighs_level1 build: context: . 
network: host args: BASE_IMAGE: "${BASE_IMAGE}" - container_name: hhs_level1 + container_name: ighs_level1 runtime: habana environment: - HABANA_VISIBLE_DEVICES=all - OMPI_MCA_btl_vader_single_copy_mechanism=none - - HHS_LEVEL=1 + - IGHS_LEVEL=1 cap_add: - SYS_NICE ipc: host network_mode: host - working_dir: /tmp/hhs/habana_health_screen + working_dir: /tmp/ighs/intel_gaudi_health_screen volumes: - ./ssh:/root/.ssh/ - - /tmp/hhs/habana_health_screen:/tmp/hhs/habana_health_screen + - /tmp/ighs/intel_gaudi_health_screen:/tmp/ighs/intel_gaudi_health_screen - /etc/localtime:/etc/localtime:ro command: > - bash -c "python screen.py --hhs-check node --logs-dir $${LOG_DIR} --node-name $${MY_NODE_NAME} --node-write-report && \ + bash -c "python screen.py --ighs-check node --logs-dir $${LOG_DIR} --node-name $${MY_NODE_NAME} --node-write-report && \ chmod 777 -R $${LOG_DIR}" diff --git a/utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L2-launcher.yaml b/utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-launcher.yaml similarity index 65% rename from utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L2-launcher.yaml rename to utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-launcher.yaml index b19c303..454550e 100644 --- a/utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L2-launcher.yaml +++ b/utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-launcher.yaml @@ -1,27 +1,27 @@ version: '3.3' services: - hhs_level2_launcher: - image: hhs_level2 + ighs_level2_launcher: + image: ighs_level2 build: context: . network: host args: BASE_IMAGE: "${BASE_IMAGE}" - container_name: hhs_level2_launcher + container_name: ighs_level2_launcher runtime: habana environment: - HABANA_VISIBLE_DEVICES=all - OMPI_MCA_btl_vader_single_copy_mechanism=none - - HHS_LEVEL=2 + - IGHS_LEVEL=2 cap_add: - SYS_NICE ipc: host network_mode: host - working_dir: /tmp/hhs/habana_health_screen + working_dir: /tmp/ighs/intel_gaudi_health_screen volumes: - ./ssh:/root/.ssh/ - - /tmp/hhs/habana_health_screen:/tmp/hhs/habana_health_screen + - /tmp/ighs/intel_gaudi_health_screen:/tmp/ighs/intel_gaudi_health_screen - /etc/localtime:/etc/localtime:ro command: > template/bare-metal/run_hccl_demo.sh \ No newline at end of file diff --git a/utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L2-worker.yaml b/utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-worker.yaml similarity index 63% rename from utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L2-worker.yaml rename to utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-worker.yaml index a8f6c6a..8a99927 100644 --- a/utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L2-worker.yaml +++ b/utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-worker.yaml @@ -1,26 +1,26 @@ version: '3.3' services: - hhs_level2_worker: - image: hhs_level2 + ighs_level2_worker: + image: ighs_level2 build: context: . 
network: host args: BASE_IMAGE: "${BASE_IMAGE}" - container_name: hhs_level2_worker + container_name: ighs_level2_worker runtime: habana environment: - HABANA_VISIBLE_DEVICES=all - OMPI_MCA_btl_vader_single_copy_mechanism=none - - HHS_LEVEL=2 + - IGHS_LEVEL=2 cap_add: - SYS_NICE ipc: host network_mode: host - working_dir: /tmp/hhs/habana_health_screen + working_dir: /tmp/ighs/intel_gaudi_health_screen volumes: - ./ssh:/root/.ssh/ - - /tmp/hhs/habana_health_screen:/tmp/hhs/habana_health_screen + - /tmp/ighs/intel_gaudi_health_screen:/tmp/ighs/intel_gaudi_health_screen - /etc/localtime:/etc/localtime:ro tty: true diff --git a/utils/habana_health_screen/template/bare-metal/run_hccl_demo.sh b/utils/intel_gaudi_health_screen/template/bare-metal/run_hccl_demo.sh similarity index 70% rename from utils/habana_health_screen/template/bare-metal/run_hccl_demo.sh rename to utils/intel_gaudi_health_screen/template/bare-metal/run_hccl_demo.sh index b772ebf..5d51d58 100644 --- a/utils/habana_health_screen/template/bare-metal/run_hccl_demo.sh +++ b/utils/intel_gaudi_health_screen/template/bare-metal/run_hccl_demo.sh @@ -1,8 +1,8 @@ #!/bin/bash NUM_NODES="${NUM_NODES:-1}"; -HOME_DIR="${HOME_DIR:-/tmp/hhs/habana_health_screen}"; -WORK_DIR="${WORK_DIR:-/tmp/hhs/habana_health_screen/build/hccl_demo}"; +HOME_DIR="${HOME_DIR:-/tmp/ighs/intel_gaudi_health_screen}"; +WORK_DIR="${WORK_DIR:-/tmp/ighs/intel_gaudi_health_screen/build/hccl_demo}"; NGPU_PER_NODE=8; N_CARDS=$((NUM_NODES*NGPU_PER_NODE)); @@ -32,9 +32,9 @@ $CMD \ -x MASTER_ADDR \ -x PYTHONPATH="/usr/lib/habanalabs/:$PYTHONPATH" \ -x ENABLE_CONSOLE="true" -x LOG_LEVEL_ALL=4 \ -2>&1 | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; +2>&1 | ts '[%Y-%m-%d %H:%M:%S]' | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; cd ${HOME_DIR}; -python $HOME_DIR/screen.py --hhs-check hccl-demo --logs-dir $LOG_DIR --job-id $JOB_ID --target-nodes $TARGET_NODES --node-name $MY_NODE_NAME; +python $HOME_DIR/screen.py --ighs-check hccl-demo --logs-dir $LOG_DIR --job-id $JOB_ID --target-nodes $TARGET_NODES --node-name $MY_NODE_NAME; chmod 777 -R $HOME_DIR/$LOG_DIR diff --git a/utils/habana_health_screen/template/k8s/pt-habana-health-screen-L1.yaml b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml similarity index 76% rename from utils/habana_health_screen/template/k8s/pt-habana-health-screen-L1.yaml rename to utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml index 3bc647d..0161d65 100644 --- a/utils/habana_health_screen/template/k8s/pt-habana-health-screen-L1.yaml +++ b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml @@ -4,12 +4,12 @@ metadata: name: template-metadata-name namespace: default labels: - app: hhs + app: ighs spec: template: metadata: labels: - app: hhs + app: ighs spec: restartPolicy: "Never" affinity: @@ -20,7 +20,7 @@ spec: - key: kubernetes.io/hostname operator: In values: - - HHS-DUMMY-VAL + - IGHS-DUMMY-VAL volumes: - name: mydir hostPath: @@ -33,18 +33,19 @@ spec: containers: - name: template-container-name image: template-container-image - workingDir: /habana_health_screen + imagePullPolicy: IfNotPresent + workingDir: /intel_gaudi_health_screen command: ["/bin/bash", "-c"] args: - >- - python $HOME_DIR/screen.py --hhs-check node --logs-dir $LOG_DIR --node-write-report; + python $HOME_DIR/screen.py --ighs-check node --logs-dir $LOG_DIR --node-write-report; volumeMounts: - name: mydir - mountPath: /habana_health_screen + mountPath: /intel_gaudi_health_screen env: - 
name: HOME_DIR - value: "/habana_health_screen" - - name: HHS_LEVEL + value: "/intel_gaudi_health_screen" + - name: IGHS_LEVEL value: "1" - name: MY_POD_IP valueFrom: @@ -61,7 +62,9 @@ spec: resources: limits: habana.ai/gaudi: 8 + hugepages-2Mi: 29000Mi cpu: 95 requests: habana.ai/gaudi: 8 + hugepages-2Mi: 29000Mi cpu: 95 diff --git a/utils/habana_health_screen/template/k8s/pt-habana-health-screen-L2_hccl-demo.yaml b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml similarity index 80% rename from utils/habana_health_screen/template/k8s/pt-habana-health-screen-L2_hccl-demo.yaml rename to utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml index 11d7b22..3768a1e 100644 --- a/utils/habana_health_screen/template/k8s/pt-habana-health-screen-L2_hccl-demo.yaml +++ b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml @@ -4,7 +4,7 @@ metadata: name: template-metadata-name namespace: default labels: - app: hhs-hccl + app: ighs-hccl spec: slotsPerWorker: 8 runPolicy: @@ -15,7 +15,7 @@ spec: template: metadata: labels: - app: hhs-hccl + app: ighs-hccl spec: volumes: - name: mydir @@ -24,11 +24,12 @@ spec: type: Directory containers: - image: template-container-image - name: pt-hhs-launcher - workingDir: /habana_health_screen + name: ighs-launcher + imagePullPolicy: IfNotPresent + workingDir: /intel_gaudi_health_screen volumeMounts: - name: mydir - mountPath: /habana_health_screen + mountPath: /intel_gaudi_health_screen env: - name: JOB_ID valueFrom: @@ -39,8 +40,8 @@ spec: fieldRef: fieldPath: spec.nodeName - name: HOME_DIR - value: "/habana_health_screen" - - name: HHS_LEVEL + value: "/intel_gaudi_health_screen" + - name: IGHS_LEVEL value: "2" command: ["/bin/bash", "-c"] args: @@ -73,19 +74,20 @@ spec: --rank-by core --report-bindings \ --tag-output \ --merge-stderr-to-stdout --prefix $MPI_ROOT \ + --mca btl_tcp_if_include eth0 \ -x PYTHONPATH="/usr/lib/habanalabs/:$PYTHONPATH" \ -x ENABLE_CONSOLE="true" -x LOG_LEVEL_ALL=4 \ - -x MAX_TIMEOUT=60 2>&1 | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; + -x MAX_TIMEOUT=60 2>&1 | ts '[%Y-%m-%d %H:%M:%S]' | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log; cd ${HOME_DIR}; - python ${HOME_DIR}/screen.py --hhs-check hccl-demo --target-nodes $TARGET_NODES --job-id $JOB_ID --logs-dir $LOG_DIR --round $ROUND; + python ${HOME_DIR}/screen.py --ighs-check hccl-demo --target-nodes $TARGET_NODES --job-id $JOB_ID --logs-dir $LOG_DIR --round $ROUND; Worker: replicas: template-num-nodes template: metadata: labels: - app: hhs-hccl + app: ighs-hccl spec: affinity: nodeAffinity: @@ -95,7 +97,7 @@ spec: - key: kubernetes.io/hostname operator: In values: - - HHS-DUMMY-VAL + - IGHS-DUMMY-VAL volumes: - name: mydir hostPath: @@ -110,17 +112,22 @@ spec: effect: "NoExecute" containers: - image: template-container-image - name: pt-hhs-worker + name: ighs-worker + imagePullPolicy: IfNotPresent resources: limits: habana.ai/gaudi: 8 + hugepages-2Mi: 29000Mi + cpu: 95 requests: habana.ai/gaudi: 8 + hugepages-2Mi: 29000Mi + cpu: 95 volumeMounts: - name: mydir - mountPath: /habana_health_screen + mountPath: /intel_gaudi_health_screen env: - - name: HHS_LEVEL + - name: IGHS_LEVEL value: "2" - name: MY_POD_IP valueFrom: diff --git a/utils/habana_health_screen/utilities.py b/utils/intel_gaudi_health_screen/utilities.py similarity index 91% rename from utils/habana_health_screen/utilities.py rename to utils/intel_gaudi_health_screen/utilities.py index d8e015a..a782d14 
100644 --- a/utils/habana_health_screen/utilities.py +++ b/utils/intel_gaudi_health_screen/utilities.py @@ -17,7 +17,7 @@ import logging from logging import handlers -_logger = logging.getLogger("habana_health_screener") +_logger = logging.getLogger("health_screener") def get_logging_level(log_level): log_level = log_level.lower() @@ -96,16 +96,11 @@ def run_cmd(cmd, timeout_s=1_800, verbose=False): return result.stdout def download_repos(): - """ Download Habana's Setup_and_Install and HCCL_DEMO Repos to assist in health checks + """ Download HCCL_DEMO Repo to assist in health checks """ if not os.path.exists("build"): os.makedirs("build") - if not os.path.exists("build/Setup_and_Install"): - _logger.info(f"Downloading Setup_and_Install into build/") - cmd = "git clone https://github.com/HabanaAI/Setup_and_Install.git build/Setup_and_Install" - run_cmd(cmd) - if not os.path.exists("build/hccl_demo"): _logger.info(f"Downloading hccl_demo into build/") cmd = "git clone https://github.com/HabanaAI/hccl_demo.git build/hccl_demo" @@ -168,21 +163,21 @@ def clear_job(job): time.sleep(10) -def clear_hhs_pods(job_type="jobs"): - """ Clear Pods with label=hhs,hhs-hccl +def clear_ighs_pods(job_type="jobs"): + """ Clear Pods with label=ighs,ighs-hccl Args: job_type (str, optional): Type of Job to delete. Options: [jobs, mpijobs]. Defaults to "jobs". """ - _logger.info(f"Checking for existing HHS Pods ({job_type})") + _logger.info(f"Checking for existing IGHS Pods ({job_type})") - metadata_app = "hhs" if (job_type == "jobs") else "hhs-hccl" + metadata_app = "ighs" if (job_type == "jobs") else "ighs-hccl" cmd = f"kubectl get pods -n default -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers" output = run_cmd(cmd).strip() if len(output) > 0: - _logger.info(f"Found existing HHS Pods ({job_type}). Will delete.") + _logger.info(f"Found existing IGHS Pods ({job_type}). Will delete.") cmd = f"kubectl get {job_type} -n default -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers" output = run_cmd(cmd).strip() diff --git a/utils/intel_gaudi_health_screen/version.txt b/utils/intel_gaudi_health_screen/version.txt new file mode 100644 index 0000000..359a5b9 --- /dev/null +++ b/utils/intel_gaudi_health_screen/version.txt @@ -0,0 +1 @@ +2.0.0 \ No newline at end of file
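
For reference, a minimal bare-metal smoke test of the renamed Level 1 screen might look like the sketch below. The key name (`ighs_rsa`), the remote path (`/tmp/ighs`), and the compose file name (`intel-gaudi-docker-compose-L1.yaml`) are taken from the templates in this patch; the `BASE_IMAGE` value and the exact `docker compose` invocation are assumptions that depend on the local setup (the screen resolves its own compose command at runtime).

```bash
#!/bin/bash
# Hypothetical single-node check of the renamed Level 1 screen.
# Paths mirror the defaults visible in the diff; adjust to your environment.

# Base image used by the compose build args (assumed value, replace with your image).
export BASE_IMAGE="vault.habana.ai/gaudi-docker/latest"

# 1. Create the ssh key pair the screen expects (ighs_rsa / ighs_rsa.pub).
mkdir -p ./ssh
ssh-keygen -t rsa -f ./ssh/ighs_rsa -N ""

# 2. Bring up the Level 1 container from the generated job directory.
docker compose -f /tmp/ighs/jobs/L1/intel-gaudi-docker-compose-L1.yaml up

# 3. Tear it down again, matching clear_ighs_pods(job_type="jobs").
docker compose -f /tmp/ighs/jobs/L1/intel-gaudi-docker-compose-L1.yaml down
```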