diff --git a/dockerfiles/README.md b/dockerfiles/README.md index 3d10c07..4758e76 100644 --- a/dockerfiles/README.md +++ b/dockerfiles/README.md @@ -30,7 +30,6 @@ This script can be used as reference to build docker images for Gaudi. 1. Go into the folder of the image type you would like to build: * base * pytorch - * tensorflow * triton 2. Run build command to generate Docker image @@ -38,16 +37,16 @@ This script can be used as reference to build docker images for Gaudi. make build ``` Examples: - #### Build pytorch image for ubuntu22.04: + #### Build pytorch image for rhel9.2: ``` cd pytorch - make build BUILD_OS=ubuntu22.04 + make build BUILD_OS=rhel9.2 ``` - #### Build tensorflow image rhel8.6: + #### Build triton image (default OS - ubuntu22.04): ``` - cd tensorflow - make build BUILD_OS=rhel8.6 + cd triton + make build ``` 3. Build command variables diff --git a/dockerfiles/base/Dockerfile.amzn2 b/dockerfiles/base/Dockerfile.amzn2 index 70f3c07..9094ae5 100644 --- a/dockerfiles/base/Dockerfile.amzn2 +++ b/dockerfiles/base/Dockerfile.amzn2 @@ -57,19 +57,18 @@ RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/AmazonLinux2" >> /etc/yum.repos.d/habanalabs.repo && \ echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/AmazonLinux2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo -RUN yum install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".amzn2 && \ +RUN yum makecache && \ + yum install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".amzn2 && \ yum install -y habanalabs-thunk-"$VERSION"-"$REVISION".amzn2 && \ yum install -y habanalabs-firmware-tools-"$VERSION"-"$REVISION".amzn2 && \ - yum install -y habanalabs-graph-"$VERSION"-"$REVISION".amzn2 - -RUN rpm -V habanalabs-rdma-core && rpm -V habanalabs-thunk && rpm -V habanalabs-firmware-tools && rpm -V habanalabs-graph + yum install -y habanalabs-graph-"$VERSION"-"$REVISION".amzn2 && \ + rpm -V habanalabs-rdma-core && rpm -V habanalabs-thunk && rpm -V habanalabs-firmware-tools && rpm -V habanalabs-graph && \ + rm -f /etc/yum.repos.d/habanalabs.repo && \ + yum clean all && rm -rf /var/cache/yum RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 2 && \ update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1 -RUN rm -f /etc/yum.repos.d/habanalabs.repo && \ - yum clean all && rm -rf /var/cache/yum - # SSH configuration necessary to support mpi-operator v2 RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ sed -i 's/[ #]\(.*ForwardAgent \).*/ \1yes/g' /etc/ssh/ssh_config && \ @@ -88,13 +87,12 @@ RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/o ./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \ make && make install -ENV LIBOFI_VERSION="0.0.1" -RUN wget -nv -O /tmp/v${LIBOFI_VERSION}.tar.gz https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/tags/v${LIBOFI_VERSION}.tar.gz && \ - cd /tmp/ && tar xf /tmp/v${LIBOFI_VERSION}.tar.gz && \ - cd /tmp/hccl_ofi_wrapper-${LIBOFI_VERSION} && \ +RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \ + unzip /tmp/main.zip -d /tmp && \ + cd /tmp/hccl_ofi_wrapper-main && \ make && cp -f libhccl_ofi_wrapper.so /usr/lib/habanalabs/libhccl_ofi_wrapper.so && \ cd / && \ - rm -rf /tmp/v${LIBOFI_VERSION}.tar.gz /tmp/hccl_ofi_wrapper-${LIBOFI_VERSION} + rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main RUN python3 -m pip 
install pip==23.3.1 setuptools==67.3.3 wheel==0.38.4 diff --git a/dockerfiles/base/Dockerfile.debian10.10 b/dockerfiles/base/Dockerfile.debian10.10 index 89cf2a7..7d8222d 100644 --- a/dockerfiles/base/Dockerfile.debian10.10 +++ b/dockerfiles/base/Dockerfile.debian10.10 @@ -75,23 +75,12 @@ ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH ENV PATH=${LIBFABRIC_ROOT}/bin:$PATH -# ucx installation -ENV UCX_VERSION="1.15.0" -ENV UCX_ROOT="/opt/habanalabs/ucx-${UCX_VERSION}" -RUN wget -nv -O /tmp/ucx-${UCX_VERSION}.tar.gz https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz && \ - cd /tmp/ && tar xf /tmp/ucx-${UCX_VERSION}.tar.gz && \ - cd /tmp/ucx-${UCX_VERSION} && \ - ./configure --prefix=${UCX_ROOT} && \ - make && make install -ENV LD_LIBRARY_PATH=${UCX_ROOT}/lib:$LD_LIBRARY_PATH -ENV PATH=${UCX_ROOT}/bin:$PATH - # install openmpi ENV MPI_ROOT="/usr/local/share/openmpi" RUN wget -nv -P /tmp/openmpi "https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VER}.tar.gz" && \ cd /tmp/openmpi && tar xzf openmpi-${OPENMPI_VER}.tar.gz && \ cd /tmp/openmpi/openmpi-${OPENMPI_VER} && \ - ./configure --prefix=${MPI_ROOT} --with-sge --disable-builtin-atomics --enable-orterun-prefix-by-default --with-ucx=${UCX_ROOT} --with-verbs && \ + ./configure --prefix=${MPI_ROOT} --with-sge --disable-builtin-atomics --enable-orterun-prefix-by-default --with-verbs && \ make -j 8 && \ sudo make install && \ cp LICENSE ${MPI_ROOT} && \ @@ -149,13 +138,12 @@ RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/o ./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \ make && make install -ENV LIBOFI_VERSION="0.0.1" -RUN wget -nv -O /tmp/v${LIBOFI_VERSION}.tar.gz https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/tags/v${LIBOFI_VERSION}.tar.gz && \ - cd /tmp/ && tar xf /tmp/v${LIBOFI_VERSION}.tar.gz && \ - cd /tmp/hccl_ofi_wrapper-${LIBOFI_VERSION} && \ +RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \ + unzip /tmp/main.zip -d /tmp && \ + cd /tmp/hccl_ofi_wrapper-main && \ make && cp -f libhccl_ofi_wrapper.so /usr/lib/habanalabs/libhccl_ofi_wrapper.so && \ cd / && \ - rm -rf /tmp/v${LIBOFI_VERSION}.tar.gz /tmp/hccl_ofi_wrapper-${LIBOFI_VERSION} + rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main RUN update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.8 2 && \ update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1 diff --git a/dockerfiles/base/Dockerfile.rhel8.6 b/dockerfiles/base/Dockerfile.rhel9.2 similarity index 59% rename from dockerfiles/base/Dockerfile.rhel8.6 rename to dockerfiles/base/Dockerfile.rhel9.2 index e9fed84..2843aac 100644 --- a/dockerfiles/base/Dockerfile.rhel8.6 +++ b/dockerfiles/base/Dockerfile.rhel9.2 @@ -1,9 +1,9 @@ -# Copyright (c) 2023 Habana Labs, Ltd. +# Copyright (c) 2024 Habana Labs, Ltd. 
# # SPDX-License-Identifier: Apache-2.0 # -# HabanaLabs Dockerfile base installer layer for RedHat 8.6 -FROM registry.access.redhat.com/ubi8/ubi:8.6 +# HabanaLabs Dockerfile base installer layer for RedHat 9.2 +FROM registry.access.redhat.com/ubi9/ubi:9.2 ARG ARTIFACTORY_URL ARG VERSION ARG REVISION @@ -13,20 +13,19 @@ LABEL release="${VERSION}-${REVISION}" COPY LICENSE /licenses/ -RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm && \ +RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ dnf clean all && rm -rf /var/cache/yum -RUN echo "[appstream]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ - echo "name=CentOS Linux 8 - AppStream" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ - echo "mirrorlist=http://mirrorlist.centos.org/?release=\$releasever-stream&arch=\$basearch&repo=AppStream&infra=\$infra" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ - echo "gpgcheck=0" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo - - RUN echo "[BaseOS]" > /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ - echo "name=CentOS Linux 8 - BaseOS" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ - echo "mirrorlist=http://mirrorlist.centos.org/?release=\$releasever-stream&arch=\$basearch&repo=BaseOS&infra=\$infra" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ + echo "name=CentOS Linux 9 - BaseOS" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ + echo "baseurl=https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ echo "gpgcheck=0" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo +RUN echo "[centos9]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ + echo "name=CentOS Linux 9 - AppStream" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ + echo "baseurl=https://mirror.stream.centos.org/9-stream/AppStream/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ + echo "gpgcheck=0" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo + RUN dnf install -y \ clang \ cmake3 \ @@ -42,13 +41,18 @@ RUN dnf install -y \ unzip \ llvm \ lsof \ - python38-devel \ + python3-devel \ openssh-clients \ + openssl \ + openssl-devel \ libjpeg-devel \ openssh-server \ - redhat-lsb-core \ + lsb_release \ wget \ git \ + libffi-devel \ + bzip2-devel \ + zlib-devel \ mesa-libGL \ iproute \ python3-dnf-plugin-versionlock && \ @@ -57,6 +61,10 @@ RUN dnf install -y \ dnf update -y && \ dnf clean all && rm -rf /var/cache/yum +COPY install-python310.sh . +RUN ./install-python310.sh rhel9.2 && rm install-python310.sh +ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH + COPY install_efa.sh . 
RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh @@ -71,20 +79,18 @@ ENV RDMAV_FORK_SAFE=1 ENV FI_EFA_USE_DEVICE_RDMA=1 RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \
- echo "name=Habana RH8 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \
- echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/8/8.6" >> /etc/yum.repos.d/habanalabs.repo && \
- echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/8/8.6/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo
-
-RUN echo "[powertools]" > /etc/yum.repos.d/powertools.repo && \
- echo "name=powertools" >> /etc/yum.repos.d/powertools.repo && \
- echo "baseurl=http://mirror.centos.org/centos/8-stream/PowerTools/x86_64/os/" >> /etc/yum.repos.d/powertools.repo && \
- echo "gpgcheck=0" >> /etc/yum.repos.d/powertools.repo
-
-RUN dnf install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".el8 \
- habanalabs-thunk-"$VERSION"-"$REVISION".el8 \
- habanalabs-firmware-tools-"$VERSION"-"$REVISION".el8 \
- habanalabs-graph-"$VERSION"-"$REVISION".el8 && \
- rm -f /etc/yum.repos.d/habanalabs.repo && rm -rf /tmp/* && \
+ echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \
+ echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2" >> /etc/yum.repos.d/habanalabs.repo && \
+ echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \
+ echo 'gpgcheck=1' >> /etc/yum.repos.d/habanalabs.repo
+
+RUN update-crypto-policies --set LEGACY
+
+RUN dnf install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".el9 \
+ habanalabs-thunk-"$VERSION"-"$REVISION".el9 \
+ habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \
+ habanalabs-graph-"$VERSION"-"$REVISION".el9 && \
+ rm -f /etc/yum.repos.d/habanalabs.repo && rm -f /etc/yum.repos.d/habana.repo && rm -rf /tmp/* && \
 dnf clean all && rm -rf /var/cache/yum

RUN rpm -V habanalabs-rdma-core && rpm -V habanalabs-thunk && rpm -V habanalabs-firmware-tools && rpm -V habanalabs-graph @@ -101,21 +107,18 @@ RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/o ./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \
 make && make install

-ENV LIBOFI_VERSION="0.0.1"
-RUN wget -nv -O /tmp/v${LIBOFI_VERSION}.tar.gz https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/tags/v${LIBOFI_VERSION}.tar.gz && \
- cd /tmp/ && tar xf /tmp/v${LIBOFI_VERSION}.tar.gz && \
- cd /tmp/hccl_ofi_wrapper-${LIBOFI_VERSION} && \
+RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \
+ unzip /tmp/main.zip -d /tmp && \
+ cd /tmp/hccl_ofi_wrapper-main && \
 make && cp -f libhccl_ofi_wrapper.so /usr/lib/habanalabs/libhccl_ofi_wrapper.so && \
 cd / && \
- rm -rf /tmp/v${LIBOFI_VERSION}.tar.gz /tmp/hccl_ofi_wrapper-${LIBOFI_VERSION}
+ rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main

-RUN python3.8 -m pip install pip==23.3.1 setuptools==67.3.3 wheel==0.38.4
+RUN python3.10 -m pip install pip==23.3.1 setuptools==67.3.3 wheel==0.38.4

-RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 2 && \
- alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \
- alternatives --set python3 /usr/bin/python3.8
+RUN ln -s /usr/bin/python3 /usr/bin/python

-RUN python3.8 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
+RUN python3.10 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"

# SSH configuration necessary to
support mpi-operator v2 RUN mkdir -p /var/run/sshd && \ diff --git a/dockerfiles/base/Dockerfile.tencentos3.1 b/dockerfiles/base/Dockerfile.tencentos3.1 index 9945288..8b0684e 100644 --- a/dockerfiles/base/Dockerfile.tencentos3.1 +++ b/dockerfiles/base/Dockerfile.tencentos3.1 @@ -84,13 +84,12 @@ RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/o ./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \ make && make install -ENV LIBOFI_VERSION="0.0.1" -RUN wget -nv -O /tmp/v${LIBOFI_VERSION}.tar.gz https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/tags/v${LIBOFI_VERSION}.tar.gz && \ - cd /tmp/ && tar xf /tmp/v${LIBOFI_VERSION}.tar.gz && \ - cd /tmp/hccl_ofi_wrapper-${LIBOFI_VERSION} && \ +RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \ + unzip /tmp/main.zip -d /tmp && \ + cd /tmp/hccl_ofi_wrapper-main && \ make && cp -f libhccl_ofi_wrapper.so /usr/lib/habanalabs/libhccl_ofi_wrapper.so && \ cd / && \ - rm -rf /tmp/v${LIBOFI_VERSION}.tar.gz /tmp/hccl_ofi_wrapper-${LIBOFI_VERSION} + rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main RUN python3.8 -m pip install pip==23.3.1 setuptools==67.3.3 wheel==0.38.4 diff --git a/dockerfiles/base/Dockerfile.ubuntu22.04 b/dockerfiles/base/Dockerfile.ubuntu22.04 index 0dd3b7b..2b9cd2f 100644 --- a/dockerfiles/base/Dockerfile.ubuntu22.04 +++ b/dockerfiles/base/Dockerfile.ubuntu22.04 @@ -97,13 +97,12 @@ RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/o ./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \ make && make install -ENV LIBOFI_VERSION="0.0.1" -RUN wget -nv -O /tmp/v${LIBOFI_VERSION}.tar.gz https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/tags/v${LIBOFI_VERSION}.tar.gz && \ - cd /tmp/ && tar xf /tmp/v${LIBOFI_VERSION}.tar.gz && \ - cd /tmp/hccl_ofi_wrapper-${LIBOFI_VERSION} && \ +RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \ + unzip /tmp/main.zip -d /tmp && \ + cd /tmp/hccl_ofi_wrapper-main && \ make && cp -f libhccl_ofi_wrapper.so /usr/lib/habanalabs/libhccl_ofi_wrapper.so && \ cd / && \ - rm -rf /tmp/v${LIBOFI_VERSION}.tar.gz /tmp/hccl_ofi_wrapper-${LIBOFI_VERSION} + rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" diff --git a/dockerfiles/tensorflow/install-python310.sh b/dockerfiles/base/install-python310.sh old mode 100644 new mode 100755 similarity index 91% rename from dockerfiles/tensorflow/install-python310.sh rename to dockerfiles/base/install-python310.sh index 01549c5..3263196 --- a/dockerfiles/tensorflow/install-python310.sh +++ b/dockerfiles/base/install-python310.sh @@ -10,12 +10,12 @@ case "${_BASE_NAME}" in echo "Skip install Python3.10 from source on Ubuntu22.04" exit 0; ;; - *debian*) + *debian* | *ubuntu*) apt update apt install -y libsqlite3-dev libreadline-dev ;; *rhel*) - yum install -y sqlite-devel readline-devel + yum install -y sqlite-devel readline-devel xz-devel ;; *amzn2*) yum install -y sqlite-devel readline-devel @@ -45,11 +45,11 @@ make -j && make altinstall # post install case "${_BASE_NAME}" in - *rhel8*) - alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 3 && \ - alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 2 && \ - alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \ + *rhel9*) + 
alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 2 && \ + alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \ alternatives --set python3 /usr/local/bin/python3.10 + export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH ;; *amzn2*) update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 3 && \ diff --git a/dockerfiles/base/install_efa.sh b/dockerfiles/base/install_efa.sh old mode 100644 new mode 100755 diff --git a/dockerfiles/common.mk b/dockerfiles/common.mk index 8d2b925..60e2b94 100644 --- a/dockerfiles/common.mk +++ b/dockerfiles/common.mk @@ -5,10 +5,9 @@ BUILD_OS ?= ubuntu22.04 BUILD_DIR ?= $(CURDIR)/dockerbuild REPO_SERVER ?= vault.habana.ai -TF_VERSION ?= 2.15.0 -PT_VERSION ?= 2.1.1 -RELEASE_VERSION ?= 1.14.0 -RELEASE_BUILD_ID ?= 493 +PT_VERSION ?= 2.2.0 +RELEASE_VERSION ?= 1.15.0 +RELEASE_BUILD_ID ?= 479 BASE_IMAGE_URL ?= base-installer-$(BUILD_OS) IMAGE_URL = $(IMAGE_NAME):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID) diff --git a/dockerfiles/pytorch/Dockerfile.amzn2 b/dockerfiles/pytorch/Dockerfile.amzn2 index dab6dd2..8a5aa95 100644 --- a/dockerfiles/pytorch/Dockerfile.amzn2 +++ b/dockerfiles/pytorch/Dockerfile.amzn2 @@ -28,7 +28,9 @@ RUN yum install -y \ zlib-devel \ lapack-devel \ openblas-devel \ - numactl && \ + pdsh \ + numactl \ + libmkl-dev && \ yum clean all RUN amazon-linux-extras install epel -y diff --git a/dockerfiles/pytorch/Dockerfile.debian10.10 b/dockerfiles/pytorch/Dockerfile.debian10.10 index bc9bb4e..066a1b7 100644 --- a/dockerfiles/pytorch/Dockerfile.debian10.10 +++ b/dockerfiles/pytorch/Dockerfile.debian10.10 @@ -29,9 +29,16 @@ RUN apt update && apt install -y \ libpcre2-dev \ libselinux1-dev \ moreutils \ + pdsh \ numactl && \ apt clean +RUN apt update && apt install -y software-properties-common && \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 3 && \ + apt-add-repository -y non-free && apt update && \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1 && \ + apt install -y libmkl-dev + # Default python is pointing to 3.7 RUN bash -c "\ ln -sf /usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 /lib/x86_64-linux-gnu/libtcmalloc.so.4; \ diff --git a/dockerfiles/pytorch/Dockerfile.rhel8.6 b/dockerfiles/pytorch/Dockerfile.rhel9.2 similarity index 51% rename from dockerfiles/pytorch/Dockerfile.rhel8.6 rename to dockerfiles/pytorch/Dockerfile.rhel9.2 index 5e17e30..c5b3bc3 100644 --- a/dockerfiles/pytorch/Dockerfile.rhel8.6 +++ b/dockerfiles/pytorch/Dockerfile.rhel9.2 @@ -1,8 +1,8 @@ -# Copyright (c) 2023 HabanaLabs, Ltd. +# Copyright (c) 2024 HabanaLabs, Ltd. 
# # SPDX-License-Identifier: Apache-2.0 # -# HabanaLabs Dockerfile PyTorch installer layer for RHEL 8.6 +# HabanaLabs Dockerfile PyTorch installer layer for RHEL 9.2 ARG BASE_NAME ARG VERSION ARG REVISION @@ -14,13 +14,18 @@ ARG BASE_NAME ARG ARTIFACTORY_URL LABEL name="PyTorch Installer" -LABEL summary="Habanalabs PyTorch installer layer for RHEL8.6" +LABEL summary="Habanalabs PyTorch installer layer for RHEL9.2" LABEL description="Image with pre installed Habanalabs packages for PyTorch" ENV LANG=en_US.UTF-8 ENV PYTHONPATH=/root:/usr/lib/habanalabs/ -RUN dnf install -y \ +RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ + echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ + echo "baseurl=https://mirror.stream.centos.org/9-stream/CRB/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ + echo "gpgcheck=0" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo + +RUN dnf install --allowerasing -y \ curl \ cairo-devel \ numactl-devel \ @@ -33,11 +38,17 @@ RUN dnf install -y \ gperftools-devel && \ dnf clean all && rm -rf /var/cache/yum +RUN dnf config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo -y && \ + dnf install --allowerasing -y intel-mkl-2020.4-912 && \ + dnf clean all && rm -rf /var/cache/yum + COPY install_packages.sh . RUN ./install_packages.sh && rm -f install_packages.sh && \ /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc +# Set LD_PRELOAD after all required installations to +# avoid warnings during docker creation ENV LD_PRELOAD=/lib64/libtcmalloc.so.4 ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 diff --git a/dockerfiles/pytorch/Dockerfile.tencentos3.1 b/dockerfiles/pytorch/Dockerfile.tencentos3.1 index bc2628a..4eced4f 100644 --- a/dockerfiles/pytorch/Dockerfile.tencentos3.1 +++ b/dockerfiles/pytorch/Dockerfile.tencentos3.1 @@ -30,9 +30,14 @@ RUN dnf install -y \ lapack-devel \ openblas-devel \ numactl \ + pdsh \ gperftools-devel && \ dnf clean all && rm -rf /var/cache/yum +RUN dnf config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo -y && \ + dnf install --allowerasing -y intel-mkl-2020.4-912 && \ + dnf clean all && rm -rf /var/cache/yum + COPY install_packages.sh . RUN ./install_packages.sh && rm -f install_packages.sh && \ diff --git a/dockerfiles/pytorch/Dockerfile.ubuntu b/dockerfiles/pytorch/Dockerfile.ubuntu index a012459..759382a 100644 --- a/dockerfiles/pytorch/Dockerfile.ubuntu +++ b/dockerfiles/pytorch/Dockerfile.ubuntu @@ -31,6 +31,8 @@ RUN apt-get update && apt-get install -y \ liblapack-dev \ libopenblas-dev \ numactl \ + pdsh \ + libmkl-dev \ libgoogle-perftools-dev && \ apt-get clean && rm -rf /var/lib/apt/lists/* diff --git a/dockerfiles/pytorch/install_packages.sh b/dockerfiles/pytorch/install_packages.sh old mode 100644 new mode 100755 index 52b8058..b96294e --- a/dockerfiles/pytorch/install_packages.sh +++ b/dockerfiles/pytorch/install_packages.sh @@ -4,8 +4,8 @@ set -ex pt_package_name="pytorch_modules-v${PT_VERSION}_${VERSION}_${REVISION}.tgz" os_string="ubuntu${OS_NUMBER}" case "${BASE_NAME}" in - *rhel8*) - os_string="rhel86" + *rhel9*) + os_string="rhel92" ;; *amzn2*) os_string="amzn2" diff --git a/dockerfiles/tensorflow/Dockerfile.amzn2 b/dockerfiles/tensorflow/Dockerfile.amzn2 deleted file mode 100644 index 14dcccd..0000000 --- a/dockerfiles/tensorflow/Dockerfile.amzn2 +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2023 HabanaLabs, Ltd. 
-# -# SPDX-License-Identifier: Apache-2.0 -# -# HabanaLabs Dockerfile Tensorflow installer layer for Amazon Linux 2 -ARG BASE_NAME -ARG VERSION -ARG REVISION -FROM ${BASE_NAME}:${VERSION}-${REVISION} -ARG BASE_NAME -ARG VERSION -ARG REVISION -ARG TF_VERSION -ARG ARTIFACTORY_URL - -ENV TF_MODULES_RELEASE_BUILD=/usr/lib/habanalabs/ -ENV PYTHONPATH=/root:/usr/lib/habanalabs/ -ENV PATH=$PATH:/usr/local/protoc/bin:/usr/local/openssl-1.1.1w/bin - -RUN yum install -y \ - bc \ - gcc10-c++ \ - libffi \ - libffi-devel && \ - yum clean all && rm -rf /var/cache/yum - -# Install protoc in version 3.6.1 from public sources. -# Protoc installed by yum install is outdated (ver 2.5.0). -RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.6.1/protoc-3.6.1-linux-x86_64.zip && \ - unzip protoc-3.6.1-linux-x86_64.zip -d /usr/local/protoc && \ - rm -rf protoc-3.6.1-linux-x86_64.zip - -ENV LD_LIBRARY_PATH=/usr/local/openssl-1.1.1w/lib:$LD_LIBRARY_PATH -COPY install-python310.sh install-python310.sh -RUN ./install-python310.sh $BASE_NAME -ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH - -RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" && \ - python3 -m pip install tensorflow-cpu==${TF_VERSION} && \ - python3 -m pip install habana-tensorflow=="${VERSION}"."${REVISION}" && \ - python3 -m pip install --upgrade "urllib3<2" && \ - python3 -m pip install scikit-build && \ - export CC=gcc10-gcc CXX=gcc10-g++ && \ - python3 -m pip install habana-horovod=="${VERSION}"."${REVISION}" - -# For AML/CentOS/RHEL OS'es TFIO_DATAPATH have to be specified to import tensorflow_io lib correctly -ENV TFIO_DATAPATH=/usr/local/lib64/python3.8/site-packages/ - -# For AML/CentOS/RHEL ca-cert file is expected exactly under /etc/ssl/certs/ca-certificates.crt -# otherwise curl will fail during access to S3 AWS storage -RUN ln -s /etc/ssl/certs/ca-bundle.crt /etc/ssl/certs/ca-certificates.crt - -RUN echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc && \ - rm -rf /tmp/* \ No newline at end of file diff --git a/dockerfiles/tensorflow/Dockerfile.debian10.10 b/dockerfiles/tensorflow/Dockerfile.debian10.10 deleted file mode 100644 index efb95f7..0000000 --- a/dockerfiles/tensorflow/Dockerfile.debian10.10 +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2023 HabanaLabs, Ltd. -# -# SPDX-License-Identifier: Apache-2.0 -# -# HabanaLabs Dockerfile Tensorflow installer layer for debian -ARG BASE_NAME -ARG VERSION -ARG REVISION -FROM ${BASE_NAME}:${VERSION}-${REVISION} -ARG BASE_NAME -ARG VERSION -ARG REVISION -ARG TF_VERSION -ARG ARTIFACTORY_URL - -ENV TF_MODULES_RELEASE_BUILD=/usr/lib/habanalabs/ -ENV PYTHONPATH=/usr/lib/habanalabs/:/root -COPY install-python310.sh install-python310.sh -RUN ./install-python310.sh $BASE_NAME -ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH - -RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" && \ - python3 -m pip install tensorflow-cpu==${TF_VERSION} && \ - python3 -m pip install habana-tensorflow=="${VERSION}"."${REVISION}" && \ - python3 -m pip install habana-horovod=="${VERSION}"."${REVISION}" - -RUN echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc && \ - rm -rf /tmp/* \ No newline at end of file diff --git a/dockerfiles/tensorflow/Dockerfile.rhel8.6 b/dockerfiles/tensorflow/Dockerfile.rhel8.6 deleted file mode 100644 index afaef85..0000000 --- a/dockerfiles/tensorflow/Dockerfile.rhel8.6 +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2023 HabanaLabs, Ltd. 
-# -# SPDX-License-Identifier: Apache-2.0 -# -# HabanaLabs Dockerfile Tensorflow installer layer for RHEL 8.6 - -ARG BASE_NAME -ARG VERSION -ARG REVISION -FROM ${BASE_NAME}:${VERSION}-${REVISION} -ARG BASE_NAME -ARG VERSION -ARG REVISION -ARG TF_VERSION -ARG ARTIFACTORY_URL - -LABEL name="Tensorflow Installer (${TF_VERSION})" -LABEL summary="Habanalabs Tensorflow (${TF_VERSION}) installer layer for RHEL 8.6" -LABEL description="Image with pre installed Habanalabs packages for Tensorflow-${TF_VERSION}" - -ENV TF_MODULES_RELEASE_BUILD=/usr/lib/habanalabs/ -ENV PYTHONPATH=/root:/usr/lib/habanalabs/ - -# Install unzip to extract pre-trained weights for BERT demo -RUN dnf install -y \ - bc \ - protobuf-devel \ - libffi-devel \ - bzip2-devel && \ - dnf clean all && rm -rf /var/cache/dnf - -COPY install-python310.sh install-python310.sh -RUN ./install-python310.sh $BASE_NAME -ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH - -RUN python3.10 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" && \ - python3.10 -m pip install tensorflow-cpu==${TF_VERSION} && \ - python3.10 -m pip install habana-tensorflow=="${VERSION}"."${REVISION}" && \ - python3.10 -m pip install habana-horovod=="${VERSION}"."${REVISION}" - -# For AML/CentOS/RHEL OS'es TFIO_DATAPATH have to be specified to import tensorflow_io lib correctly -ENV TFIO_DATAPATH=/usr/local/lib64/python3.10/site-packages/ - -# For AML/CentOS/RHEL ca-cert file is expected exactly under /etc/ssl/certs/ca-certificates.crt -# otherwise curl will fail during access to S3 AWS storage -RUN ln -s /etc/ssl/certs/ca-bundle.crt /etc/ssl/certs/ca-certificates.crt - -RUN echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc && \ - dnf clean all && rm -rf /var/cache/dnf && rm -rf /tmp/* \ No newline at end of file diff --git a/dockerfiles/tensorflow/Dockerfile.ubuntu b/dockerfiles/tensorflow/Dockerfile.ubuntu deleted file mode 100644 index bfec0d0..0000000 --- a/dockerfiles/tensorflow/Dockerfile.ubuntu +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2023 HabanaLabs, Ltd. 
-# -# SPDX-License-Identifier: Apache-2.0 -# -# HabanaLabs Dockerfile Tensorflow installer layer for Ubuntu22.04 -ARG BASE_NAME -ARG VERSION -ARG REVISION -FROM ${BASE_NAME}:${VERSION}-${REVISION} -ARG BASE_NAME -ARG VERSION -ARG REVISION -ARG TF_VERSION -ARG ARTIFACTORY_URL - -ENV TF_MODULES_RELEASE_BUILD=/usr/lib/habanalabs/ -ENV PYTHONPATH=/usr/lib/habanalabs/:/root -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - openssl \ - libssl-dev \ - libffi-dev \ - libbz2-dev && \ - apt-get autoremove && apt-get clean - -COPY install-python310.sh install-python310.sh -RUN ./install-python310.sh $BASE_NAME -ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH - -RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" && \ - python3 -m pip install tensorflow-cpu==${TF_VERSION} && \ - python3 -m pip install habana-tensorflow=="${VERSION}"."${REVISION}" && \ - python3 -m pip install habana-horovod=="${VERSION}"."${REVISION}" && \ - echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc \ No newline at end of file diff --git a/dockerfiles/tensorflow/Makefile b/dockerfiles/tensorflow/Makefile deleted file mode 100644 index ec34fb8..0000000 --- a/dockerfiles/tensorflow/Makefile +++ /dev/null @@ -1,25 +0,0 @@ - -include ../common.mk - -IMAGE_NAME = tensorflow-installer-${BUILD_OS}-tf-cpu-$(TF_VERSION) -DOCKER_BUILD_ARGS := $(DOCKER_BUILD_ARGS) --build-arg TF_VERSION=$(TF_VERSION) - - -base: -ifneq ($(shell docker image inspect $(BASE_IMAGE_URL):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID) --format="image_exists" 2>/dev/null), image_exists) - cd ../base; \ - make build; \ - cd ../tensorflow -endif - - -init: base - $(HIDE)mkdir -p $(BUILD_DIR) -ifneq (,$(findstring ubuntu,$(BUILD_OS))) - $(HIDE)cp $(CURDIR)/Dockerfile.ubuntu $(BUILD_DIR)/Dockerfile -else - $(HIDE)cp $(CURDIR)/Dockerfile.$(BUILD_OS) $(BUILD_DIR)/Dockerfile -endif - $(HIDE)cp $(CURDIR)/install-python310.sh $(BUILD_DIR)/install-python310.sh - -build: init diff --git a/dockerfiles/triton/Dockerfile b/dockerfiles/triton/Dockerfile index 0c4629a..6902eee 100644 --- a/dockerfiles/triton/Dockerfile +++ b/dockerfiles/triton/Dockerfile @@ -34,6 +34,7 @@ RUN echo "deb https://${ARTIFACTORY_URL}/artifactory/debian jammy main" | tee -a habanalabs-graph="$VERSION"-"$REVISION" && \ apt-get autoremove --yes && apt-get clean && rm -rf /var/lib/apt/lists/* && \ sed --in-place "/$ARTIFACTORY_URL/d" /etc/apt/sources.list + RUN apt-get update && apt-get install -y \ libjemalloc2 \ libcairo2-dev \ @@ -47,6 +48,7 @@ RUN apt-get update && apt-get install -y \ numactl \ libgoogle-perftools-dev && \ apt-get clean && rm -rf /var/lib/apt/lists/* + RUN python3 -m pip install pip==23.3.1 --disable-pip-version-check && \ python3 -m pip install setuptools==67.3.3 --disable-pip-version-check && \ python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --disable-pip-version-check @@ -64,5 +66,6 @@ RUN ln -s /usr/bin/python3.10 /usr/bin/python && wget --no-verbose "${PT_ARTIFAC pip uninstall -y pillow-simd && \ pip install pillow-simd==7.0.0.post3 --disable-pip-version-check && \ rm -rf /root/habanalabs pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz /tmp/* + ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libtcmalloc.so.4 ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 \ No newline at end of file diff --git a/utils/README.md b/utils/README.md index c7285ab..f9ce2a5 100644 --- a/utils/README.md +++ b/utils/README.md @@ -7,27 +7,16 @@ By installing, copying, accessing, or using the software, you agree to be 
legall
- [Overview](#overview)
- [manage_network_ifs.sh](#manage_network_ifs)
-
-<br />
-
----
-
-<br />
-
## Overview

Welcome to Gaudi's Util Scripts!

This folder contains some Gaudi utility scripts that users can access as reference.

-<br />
-
----
-
-<br />
- ## manage_network_ifs +Moved to habanalabs-qual Example: (/opt/habanalabs/qual/gaudi2/bin/manage_network_ifs.sh). + This script can be used as reference to bring up, take down, set IPs, unset IPs and check for status of the Gaudi network interfaces. The following is the usage of the script: @@ -46,7 +35,7 @@ options: Note: Please run this script with one operation at a time ``` -## Operations +### Operations Before executing any operation, this script finds all the Habana network interfaces available on the system and stores the Habana interface information into a list. The list will be used for the operations. If no Habana network interface is found, the script will exit. @@ -86,19 +75,16 @@ sudo manage_network_ifs.sh --unset-ip ## check_habana_framework_env -This script can be used as reference to check the environment for running Tensorflow or PyTorch on Habana. +This script can be used as reference to check the environment for running PyTorch on Habana. The following is the usage of the script: ``` -usage: check_habana_framework_env.py [-h] [--cards CARDS] [--framework [{tensorflow,pytorch}]] +usage: check_habana_framework_env.py [-h] [--cards CARDS] -Check health of HPU for either TensorFlow/PyTorch +Check health of HPUs for PyTorch optional arguments: -h, --help show this help message and exit --cards CARDS Set number of cards to test (default: 1) - --framework [{tensorflow,pytorch}] - ML Framework to test (default: pytorch) - -``` +``` \ No newline at end of file diff --git a/utils/check_habana_framework_env.py b/utils/check_habana_framework_env.py index 94dedec..359aac0 100755 --- a/utils/check_habana_framework_env.py +++ b/utils/check_habana_framework_env.py @@ -15,54 +15,19 @@ import concurrent.futures def parse_arguments(): - parser = argparse.ArgumentParser(description="Check health of HPU for either TensorFlow/PyTorch") + parser = argparse.ArgumentParser(description="Check health of HPUs for PyTorch") parser.add_argument("--cards", default=1, type=int, required=False, help="Set number of cards to test (default: 1)") - parser.add_argument("--framework", - default="pytorch", - type=str, - required=False, - nargs="?", - choices=("tensorflow", "pytorch"), - help="ML Framework to test (default: tensorflow)") args = parser.parse_args() print(f"Configuration: {args}") return args -def tensorflow_test(device_id=0): - """ Checks health of HPU through running a basic - TensorFlow example on HPU - - Args: - device_id (int, optional): ID of HPU. Defaults to 0. - """ - - os.environ["HLS_MODULE_ID"] = str(device_id) - - try: - import tensorflow as tf - import habana_frameworks.tensorflow as htf - htf.load_habana_module() - except Exception as e: - print(f"Card {device_id} Failed to initialize Habana TensorFlow: {str(e)}") - raise - - try: - x = tf.constant(2) - y = x + x - - assert y.numpy() == 4, 'Sanity check failed: Wrong Add output' - assert 'hpu' in y.device.lower(), 'Sanity check failed: Operation not executed on Habana Device' - except (RuntimeError, AssertionError) as e: - print(f"Card {device_id} Failure: {e}") - raise - def pytorch_test(device_id=0): """ Checks health of HPU through running a basic PyTorch example on HPU @@ -93,23 +58,14 @@ def pytorch_test(device_id=0): if __name__ == '__main__': args = parse_arguments() - fw_test = None - - if args.framework == "tensorflow": - fw_test = tensorflow_test - elif args.framework == "pytorch": - fw_test = pytorch_test - else: - print("No valid framework chosen. 
Exiting") - exit(1) try: with concurrent.futures.ProcessPoolExecutor() as executor: - for device_id, res in zip(range(args.cards), executor.map(fw_test, range(args.cards))): + for device_id, res in zip(range(args.cards), executor.map(pytorch_test, range(args.cards))): print(f"Card {device_id} PASSED") except Exception as e: print(f"Failed to initialize Habana, error: {str(e)}") print(f"Check FAILED") exit(1) - print(f"Check PASSED for {args.cards} cards") + print(f"Check PASSED for {args.cards} cards") \ No newline at end of file diff --git a/utils/manage_network_ifs.sh b/utils/manage_network_ifs.sh deleted file mode 100755 index 2ef49b2..0000000 --- a/utils/manage_network_ifs.sh +++ /dev/null @@ -1,475 +0,0 @@ -#!/bin/bash -# -# Copyright (C) 2020 HabanaLabs, Ltd. -# All Rights Reserved. -# -# Unauthorized copying of this file, via any medium is strictly prohibited. -# Proprietary and confidential. -# - -readonly HABANA_DRIVER_NAME="habanalabs" -readonly HABANA_PCI_ID="0x1da3" -readonly NICS_NUM=8 -readonly IP_PREFIX="192.168.100." -EXT_PORTS="1 8 9" - -usage() -{ - echo -e "\nusage: $(basename $1) [options]\n" - - echo -e "options:\n" - echo -e " --up toggle up all interfaces" - echo -e " --down toggle down all interfaces" - echo -e " --status print status of all interfaces" - echo -e " --set-ip set IP for all internal interfaces" - echo -e " --unset-ip unset IP from all internal interfaces" - echo -e " --set-pfc set PFC (enabled=0,1,2,3)" - echo -e " --unset-pfc unset PFC (enabled=none)" - echo -e " --check-pfc dump PFC configuration" - echo -e " --gaudi2 chip-type is Gaudi 2" - echo -e " --no-ip don't change IPs on toggle up (can be used with --up only)" - echo -e " -v, --verbose print more logs" - echo -e " -h, --help print this help" -} - -habana_net_list="" -ip_addrs="" -ip_addr="" -verbose=false -up=0 -down=0 -status=0 -set_ip=0 -unset_ip=0 -set_pfc=0 -unset_pfc=0 -check_pfc=0 -no_ip=0 -op="" -num=1 -my_sudo="" -if [ $EUID -ne 0 ]; then - my_sudo="sudo" -fi - -toggle_link() -{ - local net_if=$1 - local op=$2 - local verbose=$3 - - $my_sudo ip link set $net_if $op - if [ $? -ne 0 ]; then - echo "Failed to toggle I/F '$net_if'" - exit 1 - fi - - if [ $verbose = true ]; then - echo "Network I/F '$net_if' was toggled $op" - fi -} - -build_net_ifs_global_list() -{ - local net_if - local if_info - local driver_name - - for net_if in /sys/class/net/*/ ; do - net_if=$(basename $net_if) - - # ignore loopback and virtual ethernet devices - if [ $net_if == "lo" ] || [ `echo $net_if | cut -c1-4` == "veth" ]; then - continue - fi - - # consider habanalabs NICs only - if [ -d /sys/class/net/$net_if/device/ ]; then - if [ $(cat /sys/class/net/$net_if/device/vendor) != $HABANA_PCI_ID ]; then - continue - fi - else - # ignore characters including and after '@' in interface name - net_if=`echo "$net_if" | cut -d'@' -f1` - - # ignore NICs which aren't managed by KMD - if_info=`$my_sudo ethtool -i $net_if` - if [ $? 
-ne 0 ]; then - echo "Failed to acquire information for the network interface '$net_if'" - continue - fi - - driver_name=`echo "$if_info" | grep 'driver' | awk '{print $2}'` - if [[ $driver_name != $HABANA_DRIVER_NAME* ]]; then - continue - fi - fi - - habana_net_list="$habana_net_list $net_if" - done - - if [ -z "$habana_net_list" ]; then - echo "Warning: no $HABANA_DRIVER_NAME network interfaces were detected" - exit 1 - fi -} - -check_flags () -{ - local sum - - if [ $status -gt 1 ] || [ $up -gt 1 ] || [ $down -gt 1 ] || [ $set_ip -gt 1 ] || [ $unset_ip -gt 1 ] || [ $set_pfc -gt 1 ] || [ $unset_pfc -gt 1 ] || [ $check_pfc -gt 1 ] || [ $no_ip -gt 1 ]; then - echo "each flag should be used once" - usage $0 - exit 1 - fi - - let "sum=$up + $down + $set_ip + $unset_ip + $set_pfc + $unset_pfc + $check_pfc + $no_ip" - if [ $status -ne 0 ] && [ $sum -ne 0 ]; then - echo "status flag can't be combined with other flags" - usage $0 - exit 1 - fi - - let "sum=$up + $down" - if [ $sum -gt 1 ]; then - echo "up and down flags can't be combined together" - usage $0 - exit 1 - fi - - let "sum=$set_ip + $unset_ip + $set_pfc + $unset_pfc + $check_pfc + $no_ip" - if [ $down -ne 0 ] && [ $sum -ne 0 ]; then - echo "down flag can't be combined with other flags" - usage $0 - exit 1 - fi - - let "sum=$set_ip + $unset_ip + $set_pfc + $unset_pfc + $check_pfc" - if [ $up -ne 0 ] && [ $sum -ne 0 ]; then - echo "up flag can be combined only with no-ip flag" - usage $0 - exit 1 - fi - - let "sum=$set_ip + $unset_ip" - if [ $sum -gt 1 ]; then - echo "set-ip and unset-ip flags can't be combined together" - usage $0 - exit 1 - fi - - let "sum=$set_pfc + $unset_pfc + $check_pfc" - if [ $sum -gt 1 ]; then - echo "PFC flags can't be combined together" - usage $0 - exit 1 - fi -} - -show_prog_bar () -{ - local progress - local done - local left - - let progress=(${1}*100/${2}*100)/100 - let done=(${progress}*4)/10 - let left=40-$done - - done=$(printf "%${done}s") - left=$(printf "%${left}s") - - printf "\r$3 : [${done// /\#}${left// /-}] ${progress}%%" -} - -sleep_with_prog_bar () -{ - local end - local i - - let end=($(wc -w <<< "$habana_net_list")/2)+10 - - for i in $(seq 1 ${end}) - do - sleep 0.1 - show_prog_bar ${i} ${end} $1 - done - - echo "" -} - -while [ -n "$1" ]; -do - case $1 in - -h | --help ) - usage $0 - exit 0 - ;; - --up ) - let up++ - ;; - --down ) - let down++ - ;; - --status ) - let status++ - ;; - --set-ip ) - let set_ip++ - ;; - --unset-ip ) - let unset_ip++ - ;; - --set-pfc ) - let set_pfc++ - ;; - --unset-pfc ) - let unset_pfc++ - ;; - --check-pfc ) - let check_pfc++ - ;; - --gaudi2 ) - EXT_PORTS="" - ;; - --no-ip ) - let no_ip++ - ;; - -v | --verbose ) - verbose=true - ;; - *) - echo "bad argument '$1'" - usage $0 - exit 1 - ;; - esac - shift -done - -check_flags - -if [ $status -eq 1 ]; then - for (( i=0; i<$NICS_NUM; i++ )); do - dev_name="accel$i" - if [ ! 
-d /sys/class/accel/$dev_name/ ]; then - echo "$dev_name doesn't exist" - continue - fi - - if [ "$(cat /sys/class/accel/$dev_name/device/status)" != "Operational" ]; then - echo "$dev_name is not operational" - continue - fi - - pci_addr=$(cat /sys/class/accel/$dev_name/device/pci_addr) - dev_ifs="" - dev_ifs_up="" - dev_ifs_down="" - dev_num_up=0 - dev_num_down=0 - - if [ -d /sys/bus/pci/devices/$pci_addr/net/ ]; then - for dev_if in /sys/bus/pci/devices/$pci_addr/net/*/; do - dev_ifs="$dev_ifs $dev_if" - done - else - for net_if in /sys/class/net/*/; do - net_if=$(basename $net_if) - - # ignore loopback and virtual ethernet devices - if [ $net_if == "lo" ] || [ `echo $net_if | cut -c1-4` == "veth" ]; then - continue - fi - - # ignore characters including and after '@' in interface name - net_if=`echo "$net_if" | cut -d'@' -f1` - - if_info=`$my_sudo ethtool -i $net_if` - if [ $? -ne 0 ]; then - echo "Failed to acquire information for the network interface '$net_if'" - exit 1 - fi - - # ignore interfaces of other devices - bus_info=`echo "$if_info" | grep 'bus-info' | awk '{print $2}'` - if [ $bus_info != $pci_addr ]; then - continue - fi - - dev_ifs="$dev_ifs $net_if" - done - fi - - for dev_if in $dev_ifs; do - dev_if=$(basename $dev_if) - - if [ ! -f /sys/class/net/$dev_if/dev_port ] || - [ ! -f /sys/class/net/$dev_if/operstate ]; then - echo "can't get dev_port/opersate of $dev_if" - exit 1 - fi - - dev_port=$(cat /sys/class/net/$dev_if/dev_port) - - if [ $(cat /sys/class/net/$dev_if/operstate) == "up" ]; then - let dev_num_up++ - if [ -z "$dev_ifs_up" ]; then - dev_ifs_up="$dev_port" - else - dev_ifs_up="$dev_ifs_up $dev_port" - fi - else - let dev_num_down++ - if [ -z "$dev_ifs_down" ]; then - dev_ifs_down="$dev_port" - else - dev_ifs_down="$dev_ifs_down $dev_port" - fi - fi - done - - echo "$dev_name" - - if [ -z "$dev_ifs_up" ] && [ -z "$dev_ifs_down" ]; then - echo "no interfaces were detected" - continue - fi - - # sort lists in ascending order - dev_ifs_up=$(echo $dev_ifs_up | xargs -n1 | sort -n | xargs) - dev_ifs_down=$(echo $dev_ifs_down | xargs -n1 | sort -n | xargs) - # add commas - dev_ifs_up=${dev_ifs_up//" "/", "} - dev_ifs_down=${dev_ifs_down//" "/", "} - - if [ $dev_num_up -gt 0 ]; then - echo "$dev_num_up ports up ($dev_ifs_up)" - fi - if [ $dev_num_down -gt 0 ]; then - echo "$dev_num_down ports down ($dev_ifs_down)" - fi - done -else - build_net_ifs_global_list -fi - -if [ $up -eq 1 ] || [ $down -eq 1 ]; then - if [ $up -eq 1 ]; then - op="up" - else - op="down" - fi - - sleep_with_prog_bar $op & - - for net_if in $habana_net_list; do - toggle_link $net_if $op $verbose & - done - - # wait for all the concurrent toggles to complete - wait - - if [ $verbose = true ]; then - echo -e "" - fi - - echo -e "$(wc -w <<< "$habana_net_list") $HABANA_DRIVER_NAME network interfaces were toggled $op" - - # set IPs by default when toggling up unless explicitly asked not to - if [ $up -eq 1 ] && [ $no_ip -eq 0 ]; then - let set_ip++ - if [ $verbose = true ]; then - echo -e "Setting IP for all internal interfaces" - echo -e "(run this script with '--unset-ip' to unset)" - fi - fi -fi - -if [ $set_ip -eq 1 ] || [ $unset_ip -eq 1 ]; then - for net_if in $habana_net_list; do - # skip non-external ports - dev_port=$(cat /sys/class/net/$net_if/dev_port) - echo $EXT_PORTS | grep -w -q $dev_port - if [ $? 
-eq 0 ]; then - continue - fi - - ip_addrs=$(ip addr show $net_if | grep "inet\b" | awk '{print $2}' | grep $IP_PREFIX) - - if [ $set_ip -eq 1 ]; then - if [ -n "$ip_addrs" ]; then - continue - fi - - ip_addr=($IP_PREFIX$num/24) - $my_sudo ip addr add $ip_addr dev $net_if - - if [ $? -eq 0 ]; then - if [ $verbose = true ]; then - echo -e "Network I/F '$net_if' set IP $ip_addr" - fi - let num++ - else - echo "Network I/F '$net_if' failed to set IP $ip_addr" - fi - elif [ $unset_ip -eq 1 ]; then - for ip_addr in $ip_addrs; do - $my_sudo ip addr del $ip_addr dev $net_if - - if [ $? -eq 0 ]; then - if [ $verbose = true ]; then - echo "Network I/F '$net_if' unset IP $ip_addr" - fi - else - echo "Network I/F '$net_if' failed to unset IP $ip_addr" - fi - done - fi - done - - if [ $verbose = true ]; then - echo -e "" - fi -fi - -if [ $set_pfc -eq 1 ] || [ $unset_pfc -eq 1 ] || [ $check_pfc -eq 1 ]; then - which lldptool > /dev/null - if [ $? -ne 0 ]; then - echo "lldptool is not installed" - exit 1 - fi - - if [ $set_pfc -eq 1 ]; then - op="set_pfc" - elif [ $unset_pfc -eq 1 ]; then - op="unset_pfc" - else - op="check_pfc" - fi - - for net_if in $habana_net_list; do - if [ $check_pfc -eq 1 ] || [ $verbose = true ]; then - echo -e "$op '$net_if'" - fi - if [ $set_pfc -eq 1 ]; then - $my_sudo lldptool -T -i $net_if -V PFC enabled=0,1,2,3 > /dev/null - elif [ $unset_pfc -eq 1 ]; then - $my_sudo lldptool -T -i $net_if -V PFC enabled=none > /dev/null - else - $my_sudo lldptool -t -i $net_if -V PFC -c enabled - fi - - if [ $? -ne 0 ]; then - echo "Error, $op '$net_if'" - exit 1 - fi - done - - if [ $verbose = true ]; then - echo -e "" - fi -fi - -exit 0
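
For reviewers who want to exercise the renamed `rhel9.2` targets end to end, here is a minimal sketch of the build flow described in `dockerfiles/README.md`, assuming the `common.mk` defaults in this patch (`RELEASE_VERSION=1.15.0`, `RELEASE_BUILD_ID=479`, `PT_VERSION=2.2.0`) and network access to the default `vault.habana.ai` repo server:

```
# Build the base installer image for RHEL 9.2; the pytorch Makefile would
# also trigger this automatically if the base image is missing locally.
cd dockerfiles/base
make build BUILD_OS=rhel9.2

# Build the PyTorch layer on top of the base image.
cd ../pytorch
make build BUILD_OS=rhel9.2

# Per common.mk, images are tagged $(IMAGE_NAME):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID),
# so the new images should show up with the 1.15.0-479 tag.
docker images | grep "1.15.0-479"
```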