Skip to content

Commit

Permalink
Merge pull request #56 from archlitchi/dev-vgpu-1219
Browse files Browse the repository at this point in the history
add implementation for libvgpu_so and corresponding CI
  • Loading branch information
william-wang authored Dec 26, 2023
2 parents 15b3915 + 1e0d95d commit b90971f
Show file tree
Hide file tree
Showing 84 changed files with 12,015 additions and 9 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ on:
push:
tags:
- v[0-9]+.[0-9]+.[0-9]+
branches: [master]
branches: [master,dev-vgpu-1219]
pull_request:
branches: [master]
branches: [master,dev-vgpu-1219]

jobs:
build:
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,7 @@ kubernetes.tar.gz
# test coverage file
coverage.txt

updateso.sh

lib/nvidia/libvgpu/build

25 changes: 20 additions & 5 deletions docker/amd64/Dockerfile.vgpu-ubuntu20.04
Original file line number Diff line number Diff line change
Expand Up @@ -12,33 +12,48 @@
# See the License for the specific language governing permissions and
# limitations under the License.

FROM ubuntu:20.04 as build
ARG NVIDIA_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu20.04
ARG GOLANG_VERSION=1.19.3

FROM ubuntu:20.04 as build
RUN apt-get update && apt-get install -y --no-install-recommends \
g++ \
ca-certificates \
wget && \
rm -rf /var/lib/apt/lists/*

ENV GOLANG_VERSION 1.19.3
RUN wget -nv -O - https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-amd64.tar.gz \
RUN wget -nv -O - https://storage.googleapis.com/golang/go$GOLANG_VERSION.linux-amd64.tar.gz \
| tar -C /usr/local -xz
ENV GOPATH /go
ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH

WORKDIR /go/src/volcano.sh/devices
COPY . .

RUN go env -w GO111MODULE=on
RUN go env -w GOPROXY=https://goproxy.cn,direct
RUN export CGO_LDFLAGS_ALLOW='-Wl,--unresolved-symbols=ignore-in-object-files' && \
go build -ldflags="-s -w" -o volcano-vgpu-device-plugin ./cmd/vgpu

# Build stage for libvgpu.so, based on the image selected by the NVIDIA_IMAGE build argument.
FROM $NVIDIA_IMAGE AS NVBUILD
COPY ./lib/nvidia/libvgpu /libvgpu
WORKDIR /libvgpu
RUN apt-get -y update; apt-get -y install wget
# Download and unpack a prebuilt CMake 3.19.8 into /libvgpu.
RUN wget https://cmake.org/files/v3.19/cmake-3.19.8-Linux-x86_64.tar.gz
RUN tar -xf cmake-3.19.8-Linux-x86_64.tar.gz
# Provide a "cmake3" executable: build.sh uses that name by default.
RUN cp /libvgpu/cmake-3.19.8-Linux-x86_64/bin/cmake /libvgpu/cmake-3.19.8-Linux-x86_64/bin/cmake3
ENV PATH="/libvgpu/cmake-3.19.8-Linux-x86_64/bin:${PATH}"
RUN apt-get -y install openssl libssl-dev
# Remove any build directory that was copied in with the sources (-f: no error if absent).
RUN rm -rf /libvgpu/build
RUN bash ./build.sh

FROM debian:stretch-slim

ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=utility

COPY --from=build /go/src/volcano.sh/devices/volcano-vgpu-device-plugin /usr/bin/volcano-vgpu-device-plugin
RUN mkdir -p /k8s-vgpu/lib/nvidia
COPY --from=build /go/src/volcano.sh/devices/lib/* /k8s-vgpu/lib/nvidia/
COPY --from=build /go/src/volcano.sh/devices/lib/nvidia/ld.so.preload /k8s-vgpu/lib/nvidia/
COPY --from=NVBUILD /libvgpu/build/libvgpu.so /k8s-vgpu/lib/nvidia/

ENTRYPOINT ["volcano-vgpu-device-plugin"]
2 changes: 1 addition & 1 deletion examples/vgpu-case02.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ spec:
restartPolicy: OnFailure
schedulerName: volcano
containers:
- image: nvidia/cuda:10.1-base-ubuntu18.04
- image: nvidia/cuda:11.2.2-base-ubi8
name: pod1-ctr
command: ["sleep"]
args: ["100000"]
Expand Down
Binary file removed lib/libvgpu.so
Binary file not shown.
File renamed without changes.
76 changes: 76 additions & 0 deletions lib/nvidia/libvgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Top-level build description for libvgpu, the hook library for CUDA environments.
# cmake_minimum_required must stay first: it establishes the policy defaults
# used by everything below.
cmake_minimum_required (VERSION 2.8.11)
project (libvgpu)

# get_git_hash(<out-var>)
#
# Stores the abbreviated hash of the most recent commit
# (`git log -1 --pretty=format:%h`, run in the current source directory)
# in <out-var>. Git's stderr is suppressed. When Git is not found, <out-var>
# is left untouched. This is a macro, so <out-var> and the GIT_FOUND result
# of find_package() are set directly in the caller's scope.
macro(get_git_hash _git_hash)
  find_package(Git QUIET)
  if(GIT_FOUND)
    execute_process(
      COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%h
      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
      OUTPUT_VARIABLE ${_git_hash}
      OUTPUT_STRIP_TRAILING_WHITESPACE
      ERROR_QUIET
    )
  endif()
endmacro()

# get_git_branch(<out-var>)
#
# Stores the short name of the currently checked-out branch
# (`git symbolic-ref --short -q HEAD`, run in the current source directory)
# in <out-var>. When Git is not found, <out-var> is left untouched; when the
# command produces no output (e.g. detached HEAD, or the sources are not
# inside a Git work tree) <out-var> ends up empty.
#
# ERROR_QUIET keeps Git's "fatal: not a git repository" message out of the
# configure log when building from a source copy without .git, matching
# get_git_hash().
#
# This is a macro, so <out-var> and the GIT_FOUND result of find_package()
# are set directly in the caller's scope.
macro(get_git_branch _git_branch)
  find_package(Git QUIET)
  if(GIT_FOUND)
    execute_process(
      COMMAND ${GIT_EXECUTABLE} symbolic-ref --short -q HEAD
      OUTPUT_VARIABLE ${_git_branch}
      OUTPUT_STRIP_TRAILING_WHITESPACE
      ERROR_QUIET
      WORKING_DIRECTORY
        ${CMAKE_CURRENT_SOURCE_DIR}
    )
  endif()
endmacro()

# Commit hash of the sources; stays empty when it cannot be determined.
# NOTE(review): GIT_HASH / GIT_BRANCH are presumably substituted into
# src/static_config.h.in -- confirm against that template.
set(GIT_HASH "")
get_git_hash(GIT_HASH)
message(STATUS "Git hash is ${GIT_HASH}")

# Branch name: the CI_COMMIT_REF_NAME environment variable wins when it is
# defined; otherwise ask Git (empty when that fails).
if (DEFINED ENV{CI_COMMIT_REF_NAME})
  set(GIT_BRANCH $ENV{CI_COMMIT_REF_NAME})
else()
  set(GIT_BRANCH "")
  get_git_branch(GIT_BRANCH)
endif()

# The branch name is used as-is; replacing "." and "-" with "_" is not
# performed.

# Generate config/static_config.h in the build tree from its template and
# put the generated directory on the include path.
configure_file(
  "${CMAKE_CURRENT_SOURCE_DIR}/src/static_config.h.in"
  ${CMAKE_CURRENT_BINARY_DIR}/config/static_config.h
)
include_directories(${CMAKE_CURRENT_BINARY_DIR}/config)

# Root of the CUDA installation: taken from the CUDA_HOME environment
# variable (read at configure time), defaulting to /usr/local/cuda.
# An exported-but-empty CUDA_HOME is treated like an unset one; otherwise
# the include path derived from it would collapse to "/include".
if ("$ENV{CUDA_HOME}" STREQUAL "")
  set(CUDA_HOME /usr/local/cuda)
else()
  set(CUDA_HOME "$ENV{CUDA_HOME}")
endif()


# Flag sets stored in variables for later use.
# NOTE(review): these are presumably consumed by the src/ and test/
# subdirectories -- confirm there.
#
# LIBRARY_COMPILE_FLAGS:
#   Debug     -> -shared -fPIC -g -D_GNU_SOURCE -Wall
#   otherwise -> -shared -fPIC -D_GNU_SOURCE -fvisibility=hidden -Wall
set(LIBRARY_COMPILE_FLAGS -shared -fPIC)
if (CMAKE_BUILD_TYPE STREQUAL Debug)
  list(APPEND LIBRARY_COMPILE_FLAGS -g -D_GNU_SOURCE)
else()
  list(APPEND LIBRARY_COMPILE_FLAGS -D_GNU_SOURCE -fvisibility=hidden)
endif()
list(APPEND LIBRARY_COMPILE_FLAGS -Wall)

# TEST_COMPILE_FLAGS is the same for every build type.
# NOTE(review): "-o1" is the compiler's "write output to file 1" spelling;
# "-O1" (optimization level 1) was probably intended -- confirm how the
# variable is used before changing it.
set(TEST_COMPILE_FLAGS -o1)

# Directory-wide include paths: CUDA headers first, then this project's src/.
include_directories(
  ${CUDA_HOME}/include
  ${CMAKE_CURRENT_SOURCE_DIR}/src
)

# Sub-projects. dockerfiles/ is only added when configured with
# -DBUILD_DOCKER=true.
add_subdirectory(src)
add_subdirectory(test)
if (BUILD_DOCKER STREQUAL true)
  add_subdirectory(dockerfiles)
endif()

88 changes: 88 additions & 0 deletions lib/nvidia/libvgpu/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Hook library for CUDA Environments

![image](docs/images/sample_nvidia-smi.png)


## Build

```bash
bash build.sh
```

## Build in Docker

```bash
docker build .
```

## Usage

_CUDA_DEVICE_MEMORY_LIMIT_ sets the upper limit of device memory (e.g. 1g, 1024m, 1048576k, 1073741824)

_CUDA_DEVICE_SM_LIMIT_ sets the maximum SM utilization percentage of each device

```bash
# Limit device memory to 1 GB and cap SM utilization at 50% for all devices
export LD_PRELOAD=./libvgpu.so
export CUDA_DEVICE_MEMORY_LIMIT=1g
export CUDA_DEVICE_SM_LIMIT=50
```

## Docker Images
```bash
# Make docker image
docker build . -f=dockerfiles/Dockerfile-tf1.8-cu90

# Launch the docker image
export DEVICE_MOUNTS="--device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl"
export LIBRARY_MOUNTS="-v /usr/cuda_files:/usr/cuda_files -v $(which nvidia-smi):/bin/nvidia-smi"

docker run ${LIBRARY_MOUNTS} ${DEVICE_MOUNTS} -it \
-e CUDA_DEVICE_MEMORY_LIMIT=2g \
cuda_vmem:tf1.8-cu90 \
python -c "import tensorflow; tensorflow.Session()"
```

## Log

Set the environment variable LIBCUDA_LOG_LEVEL to control which log messages are shown

| LIBCUDA_LOG_LEVEL | description |
| ----------------- | ----------- |
| not set | errors,warnings,messages |
| 3 | infos,errors,warnings,messages |
| 4 | debugs,errors,warnings,messages |

## Test with Frameworks

Run operations that require at least 4 GB of device memory and will therefore run out of memory (OOM) under a 1 GB limit

- TensorFlow

```bash
python test/python/limit_tensorflow.py --device=0 --tensor_shape=1024,1024,1024
```

- TensorFlow 2.0

```bash
python test/python/limit_tensorflow2.py --device=0 --tensor_shape=1024,1024,1024
```

- PyTorch

```bash
python test/python/limit_pytorch.py --device=0 --tensor_shape=1024,1024,1024
```

- MXNet

```bash
python test/python/limit_mxnet.py --device=0 --tensor_shape=1024,1024,1024
```

## Test Raw APIs

```bash
./test/test_alloc
```
38 changes: 38 additions & 0 deletions lib/nvidia/libvgpu/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash
# Configure and build libvgpu in the "build" directory next to this script.
#
# Environment overrides:
#   J              parallel make jobs (default: half the CPU count, rounded up)
#   CMAKE          cmake executable (default: cmake3, or cmake when cmake3
#                  is not on PATH)
#   CMAKE_OPTIONS  extra options placed before the built-in -D options
#   CI_COMMIT_BRANCH, CI_COMMIT_SHA (GitLab CI) and BRANCH_NAME (Jenkins)
#                  are only echoed by this script.
#
# Exits non-zero as soon as changing directory or the cmake configure step
# fails, so a stale Makefile from an earlier run is never built by mistake.

root_dir=$(cd "$(dirname "$0")" && pwd) || exit 1
cd "$root_dir" || exit 1

mkdir -p build && cd build || exit 1

[[ -z "$J" ]] && J=$(nproc | awk '{print int(($0 + 1)/ 2)}')

# Prefer cmake3 (the historical default); fall back to plain cmake when no
# cmake3 executable is installed.
if [[ -z "${CMAKE:-}" ]]; then
    if command -v cmake3 >/dev/null 2>&1; then
        CMAKE="cmake3"
    else
        CMAKE="cmake"
    fi
fi

CMAKE_OPTIONS=${CMAKE_OPTIONS-""}
#CMAKE_OPTIONS+=" -DMEMORY_LIMIT_DEBUG=1"
CMAKE_OPTIONS+=" -DDLSYM_HOOK_ENABLE=1"
#CMAKE_OPTIONS+=" -DDLSYM_HOOK_DEBUG=1"
CMAKE_OPTIONS+=" -DMULTIPROCESS_LIMIT_ENABLE=1"
CMAKE_OPTIONS+=" -DHOOK_MEMINFO_ENABLE=1"
CMAKE_OPTIONS+=" -DHOOK_NVML_ENABLE=1"
CMAKE_OPTIONS+=" -DCMAKE_BUILD_TYPE=Debug"

#CMAKE_OPTIONS+=" -DBUILD_DOCKER=true"
# gitlab ci
CI_COMMIT_BRANCH=${CI_COMMIT_BRANCH-""}
CI_COMMIT_SHA=${CI_COMMIT_SHA-""}

# jenkins
if [ -z "$CI_COMMIT_BRANCH" ]; then
    CI_COMMIT_BRANCH=${BRANCH_NAME-"unknown"}
fi
if [ -z "$CI_COMMIT_SHA" ]; then
    # Fall back to "unknown" when git is missing or this is not a work tree.
    CI_COMMIT_SHA=$(git describe --abbrev=100 --always) || CI_COMMIT_SHA="unknown"
fi

echo "CI_COMMIT_BRANCH:${CI_COMMIT_BRANCH}"
echo "CI_COMMIT_SHA:${CI_COMMIT_SHA}"

# CMAKE and CMAKE_OPTIONS are deliberately unquoted: they are split on
# whitespace into separate command-line words.
${CMAKE} .. ${CMAKE_OPTIONS} -DTEST_DEVICE_ID=0 || exit $?
make -j"$J"
11 changes: 11 additions & 0 deletions lib/nvidia/libvgpu/dockerfiles/Dockerfile.aarch64
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Builds libvgpu for aarch64 inside a CUDA 12.2 devel image.
# The build context is copied to /libvgpu, so build.sh must be at its root.
FROM nvidia/cuda:12.2.0-devel-ubuntu20.04
COPY . /libvgpu
WORKDIR /libvgpu
RUN apt-get -y update; apt-get -y install wget
# Download and unpack a prebuilt CMake 3.19.8 into /libvgpu.
RUN wget https://cmake.org/files/v3.19/cmake-3.19.8-Linux-aarch64.tar.gz
RUN tar -xf cmake-3.19.8-Linux-aarch64.tar.gz
# Provide a "cmake3" executable: build.sh uses that name by default.
RUN cp /libvgpu/cmake-3.19.8-Linux-aarch64/bin/cmake /libvgpu/cmake-3.19.8-Linux-aarch64/bin/cmake3
ENV PATH="/libvgpu/cmake-3.19.8-Linux-aarch64/bin:${PATH}"
# -f: a clean context has no build directory; plain "rm -r" would exit
# non-zero there and abort the image build.
RUN rm -rf ./build
RUN apt-get -y install openssl libssl-dev
RUN bash ./build.sh
11 changes: 11 additions & 0 deletions lib/nvidia/libvgpu/dockerfiles/Dockerfile.x86_64
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Builds libvgpu for x86_64 inside a CUDA 12.2 devel image.
# The build context is copied to /libvgpu, so build.sh must be at its root.
FROM nvidia/cuda:12.2.0-devel-ubuntu20.04
COPY . /libvgpu
WORKDIR /libvgpu
RUN apt-get -y update; apt-get -y install wget
# Download and unpack a prebuilt CMake 3.19.8 into /libvgpu.
RUN wget https://cmake.org/files/v3.19/cmake-3.19.8-Linux-x86_64.tar.gz
RUN tar -xf cmake-3.19.8-Linux-x86_64.tar.gz
# Provide a "cmake3" executable: build.sh uses that name by default.
RUN cp /libvgpu/cmake-3.19.8-Linux-x86_64/bin/cmake /libvgpu/cmake-3.19.8-Linux-x86_64/bin/cmake3
ENV PATH="/libvgpu/cmake-3.19.8-Linux-x86_64/bin:${PATH}"
# -f: a clean context has no build directory; plain "rm -r" would exit
# non-zero there and abort the image build.
RUN rm -rf ./build
RUN apt-get -y install openssl libssl-dev
RUN bash ./build.sh
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 4 additions & 0 deletions lib/nvidia/libvgpu/framework_test/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Image that runs the vgpu_test binary as its entrypoint.
FROM alpine:latest

# Copies "vgpu_test" from the build context to /vgpu_test.
ADD vgpu_test /vgpu_test
# NOTE(review): the entrypoint is resolved relative to the working directory
# and names "vgpu_test.test", while the ADD above targets /vgpu_test --
# confirm the binary really ends up at ./vgpu_test.test.
ENTRYPOINT ["./vgpu_test.test"]
2 changes: 2 additions & 0 deletions lib/nvidia/libvgpu/framework_test/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# vgpu_test

53 changes: 53 additions & 0 deletions lib/nvidia/libvgpu/framework_test/clients/dockerclient.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package clients

import (
"bytes"
"context"

"github.com/docker/docker/api/types"
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/mount"
client "github.com/docker/docker/client"
)

// GetDockerClient creates a Docker API client configured from the
// environment (client.FromEnv) with API version negotiation enabled.
// It panics when the client cannot be constructed.
func GetDockerClient() *client.Client {
	dockerCli, err := client.NewClientWithOpts(
		client.FromEnv,
		client.WithAPIVersionNegotiation(),
	)
	if err == nil {
		return dockerCli
	}
	panic(err)
}

// RunContainer creates and starts a container from image, waits until it is
// no longer running, and returns what the container wrote to stdout.
//
// Parameters:
//   - ctx:     context passed to every Docker API call.
//   - image:   image reference to run.
//   - cmd:     command to execute in the container.
//   - env:     environment variables for the container config.
//   - mount:   mounts attached through the host config.
//   - shmSize: value for the host config's ShmSize field.
//
// Any Docker API error, and any error while reading the log stream, causes a
// panic. The container is not removed afterwards, and its exit status is
// ignored.
//
// NOTE(review): the log stream of a container started without a TTY may be
// multiplexed (stdcopy framing); if so the returned string contains the
// frame headers -- confirm whether callers rely on the raw form.
func RunContainer(ctx context.Context, image string, cmd []string, env []string, mount []mount.Mount, shmSize int64) string {
	// Named cli so the imported client package is not shadowed.
	cli := GetDockerClient()
	// Release the client's connections once the logs have been collected.
	defer cli.Close()

	resp, err := cli.ContainerCreate(ctx, &container.Config{
		Image: image,
		Cmd:   cmd,
		Env:   env,
	}, &container.HostConfig{
		Mounts:  mount,
		ShmSize: shmSize,
	}, nil, nil, "")
	if err != nil {
		panic(err)
	}
	if err := cli.ContainerStart(ctx, resp.ID, types.ContainerStartOptions{}); err != nil {
		panic(err)
	}

	// Block until the container has stopped or waiting on it failed.
	statusCh, errCh := cli.ContainerWait(ctx, resp.ID, container.WaitConditionNotRunning)
	select {
	case err := <-errCh:
		if err != nil {
			panic(err)
		}
	case <-statusCh:
	}

	out, err := cli.ContainerLogs(ctx, resp.ID, types.ContainerLogsOptions{ShowStdout: true})
	if err != nil {
		panic(err)
	}
	// The log stream must be closed by the caller of ContainerLogs.
	defer out.Close()

	buf := new(bytes.Buffer)
	if _, err := buf.ReadFrom(out); err != nil {
		panic(err)
	}
	return buf.String()
}
Loading

0 comments on commit b90971f

Please sign in to comment.