Skip to content

Commit

Permalink
Merge branch 'main' into ET-198
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkim-det committed Jun 28, 2024
2 parents 7999341 + 1e9dc42 commit f1ebbf8
Show file tree
Hide file tree
Showing 249 changed files with 4,277 additions and 1,658 deletions.
76 changes: 9 additions & 67 deletions .circleci/real_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ commands:
- when:
condition: <<parameters.tf2>>
steps:
- run: docker pull determinedai/pytorch-ngc-dev:8c90e80
- run: docker pull determinedai/pytorch-ngc-dev:0e43056

login-docker:
parameters:
Expand Down Expand Up @@ -1394,14 +1394,6 @@ commands:
no_output_timeout: 30m
command: make package-ee

make-package-small:
steps:
- attach_workspace:
at: .
- run:
no_output_timeout: 30m
command: make -C master package-small

install-devcluster:
steps:
- run: pip install git+https://github.com/determined-ai/devcluster.git@v1.1.0#egg=devcluster
Expand Down Expand Up @@ -1730,39 +1722,6 @@ jobs:
fi
- run: mkdir /tmp/pkgs && cp -v */dist/*.{rpm,deb,tar.gz} /tmp/pkgs

package-and-push-system-dev-small:
docker:
- image: <<pipeline.parameters.docker-image>>
environment:
GO111MODULE: "on"
resource_class: xlarge
steps:
- checkout
- add-and-fetch-upstream
- skip-if-only-docs
- skip-if-only-github
- skip-if-only-webui
- attach_workspace:
at: .
- reinstall-go
- setup-python-venv:
install-python: false
determined: true
executor: <<pipeline.parameters.docker-image>>
- setup_remote_docker:
version: previous
- login-docker:
username: ${DOCKER_USER}
password: ${DOCKER_PASS}
- pre-package-and-push-system:
check: false
- make-package-small
- run: tools/scripts/retry.sh make -C master publish-dev-small
- persist_to_workspace:
root: .
paths:
- harness/dist

package-and-push-system-rc:
docker:
- image: <<pipeline.parameters.docker-image>>
Expand Down Expand Up @@ -2401,7 +2360,7 @@ jobs:

test-unit-harness-gpu-tf:
docker:
- image: determinedai/tensorflow-ngc-dev:8c90e80
- image: determinedai/tensorflow-ngc-dev:0e43056
resource_class: determined-ai/container-runner-gpu
steps:
- run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
Expand All @@ -2428,7 +2387,7 @@ jobs:

test-unit-harness-pytorch2-gpu:
docker:
- image: determinedai/pytorch-ngc-dev:8c90e80
- image: determinedai/pytorch-ngc-dev:0e43056
resource_class: determined-ai/container-runner-gpu
steps:
- run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
Expand All @@ -2455,7 +2414,7 @@ jobs:

test-unit-harness-pytorch2-cpu:
docker:
- image: determinedai/pytorch-ngc-dev:8c90e80
- image: determinedai/pytorch-ngc-dev:0e43056
steps:
- run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
- checkout
Expand All @@ -2481,7 +2440,7 @@ jobs:

test-unit-harness-gpu-parallel:
docker:
- image: determinedai/pytorch-ngc-dev:8c90e80
- image: determinedai/pytorch-ngc-dev:0e43056
resource_class: determined-ai/container-runner-multi-gpu
steps:
- run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
Expand All @@ -2508,7 +2467,7 @@ jobs:

test-unit-harness-gpu-deepspeed:
docker:
- image: determinedai/pytorch-ngc-dev:8c90e80
- image: determinedai/pytorch-ngc-dev:0e43056
resource_class: determined-ai/container-runner-gpu
steps:
- run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
Expand Down Expand Up @@ -3747,7 +3706,7 @@ jobs:
type: string
default: "1"
environment-image:
default: determinedai/pytorch-ngc-dev:8c90e80
default: determinedai/pytorch-ngc-dev:0e43056
type: string
accel-node-taints:
type: string
Expand Down Expand Up @@ -4897,25 +4856,6 @@ workflows:
requires:
- package-and-push-system-dev-ee

test-e2e-gke-shared-cluster:
unless:
or:
- << pipeline.parameters.do_nightly_tests >>
- equal: [ api, << pipeline.trigger_source >> ]
jobs:
- package-and-push-system-dev-small

- test-e2e-shared-cluster:
name: test-e2e-shared-cluster-cpu
context:
- gcp-shared-cluster
- gcp-ci-cluster-default-user-credentials
requires:
- package-and-push-system-dev-small
parallelism: 3
mark: "e2e_gpu and not gpu_required"
test-type: cpu

test-e2e-longrunning:
<<: *do-not-run-on-manual-trigger
jobs:
Expand Down Expand Up @@ -5688,6 +5628,8 @@ workflows:
port: 5432
name: determined
password: ${HPC_DB_PASSWD}
security:
initial_user_password: ${INITIAL_USER_PASSWORD}
- terminate-vpc-circleci:
context: ["gcp"]

Expand Down
2 changes: 1 addition & 1 deletion .circleci/scripts/pull_image_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ spec:
spec:
containers:
- name: pull-docker-daemonset
image: determinedai/pytorch-ngc-dev:8c90e80
image: determinedai/pytorch-ngc-dev:0e43056
command: ["/bin/bash"]
args: ["echo", "test"]
resources:
Expand Down
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
/e2e_tests/tests/requirements.txt @determined-ai/model-dev
/e2e_tests/tests/experiment @determined-ai/model-dev
/e2e_tests/tests/nightly @determined-ai/model-dev
/e2e_tests/tests/environment @determined-ai/model-dev

# Backend owns some e2e tests.
/e2e_tests/tests/cluster @determined-ai/backend
Expand Down
2 changes: 0 additions & 2 deletions agent/internal/containers/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,6 @@ func (m *Manager) SignalContainer(ctx context.Context, msg aproto.SignalContaine
func (m *Manager) Detach() {
m.mu.RLock()
for _, c := range m.containers {
c := c
m.wg.Go(func(_ context.Context) {
c.Detach()
})
Expand All @@ -261,7 +260,6 @@ func (m *Manager) Detach() {
func (m *Manager) Close() {
m.mu.RLock()
for _, c := range m.containers {
c := c
m.wg.Go(func(_ context.Context) {
c.Stop()
})
Expand Down
15 changes: 9 additions & 6 deletions codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,31 +12,34 @@ coverage:
backend:
target: 42%
threshold: 3%
flags:
flags:
- backend
informational: false
patch:
patch:
default:
informational: true
backend:
backend:
target: 80%
threshold: 5%
flags:
flags:
- backend
informational: false
only_pulls: true

flags:
backend:
carryforward: true

github_checks:
annotations: false

comment:
comment:
layout: "diff, flags, files"
behavior: default

parsers:
go:
partials_as_hits: true

ignore:
- "harness/determined/common/api/bindings.py"
2 changes: 1 addition & 1 deletion docs/_shared/basic-installation.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
- Accept the default username of ``determined``.
- Click **Sign In**.

After signing in, you'll need to create a :ref:`strong password <strong-password>`.
After signing in, create a :ref:`strong password <strong-password>`.
4 changes: 3 additions & 1 deletion docs/_shared/note-pip-install-determined.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.. note::

The command, ``pip install determined``, installs the ``determined`` library which includes the Determined command-line interface (CLI).
When deploying locally, the system prompts you to set a strong password.

The command, ``pip install determined``, installs the ``determined`` library which includes the Determined command-line interface (CLI).
2 changes: 1 addition & 1 deletion docs/model-dev-guide/api-guides/apis-howto/api-core-ug.rst
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ CD into the directory and run this command:

Open the Determined WebUI by navigating to the master URL. One way to do this is to navigate to
``http://localhost:8080/``, accept the default username of ``determined``, and click **Sign In**.
After signing in, you'll need to create a :ref:`strong password <strong-password>`.
After signing in, create a :ref:`strong password <strong-password>`.

.. include:: ../../../_shared/note-local-dtrain-job.txt

Expand Down
8 changes: 4 additions & 4 deletions docs/model-dev-guide/prepare-container/custom-env.rst
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,9 @@ Default Images
- - Environment
- File Name
- - CPUs
- ``determinedai/pytorch-ngc-dev:8c90e80``
- ``determinedai/pytorch-ngc-dev:0e43056``
- - NVIDIA GPUs
- ``determinedai/pytorch-ngc-dev:8c90e80``
- ``determinedai/pytorch-ngc-dev:0e43056``
- - AMD GPUs
- ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4``

Expand Down Expand Up @@ -155,7 +155,7 @@ Example Dockerfile that installs custom ``conda``-, ``pip``-, and ``apt``-based
.. code:: bash
# Determined Image
FROM determinedai/tensorflow-ngc-dev:8c90e80
FROM determinedai/tensorflow-ngc-dev:0e43056
# Custom Configuration
RUN apt-get update && \
Expand Down Expand Up @@ -216,7 +216,7 @@ environments using :ref:`custom images <custom-docker-images>`:
.. code:: bash
# Determined Image
FROM determinedai/pytorch-ngc-dev:8c90e80
FROM determinedai/pytorch-ngc-dev:0e43056
# Create a virtual environment
RUN conda create -n myenv python=3.8
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Determined supports both TensorFlow 1 and 2. The version of TensorFlow used for
experiment is controlled by the configured container image. Determined provides prebuilt Docker
images that include TensorFlow 2+, 1.15, and 2.8, respectively:

- ``determinedai/tensorflow-ngc-dev:8c90e80``
- ``determinedai/tensorflow-ngc-dev:0e43056``
- ``determinedai/environments:cuda-10.2-pytorch-1.7-tf-1.15-gpu-0.21.2``
- ``determinedai/environments:cuda-11.2-tf-2.8-gpu-0.29.1``

Expand Down
4 changes: 2 additions & 2 deletions docs/reference/deploy/helm-config-reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -197,13 +197,13 @@

- ``cpuImage``: Sets the default Docker image for all non-GPU tasks. If a Docker image is
specified in the :ref:`experiment config <exp-environment-image>`, this default is overridden.
Defaults to: ``determinedai/pytorch-ngc-dev:8c90e80``.
Defaults to: ``determinedai/pytorch-ngc-dev:0e43056``.

- ``startupHook``: An optional inline script that will be executed as part of task set up.

- ``gpuImage``: Sets the default Docker image for all GPU tasks. If a Docker image is specified
in the :ref:`experiment config <exp-environment-image>`, this default is overridden. Defaults
to: ``determinedai/pytorch-ngc-dev:8c90e80``.
to: ``determinedai/pytorch-ngc-dev:0e43056``.

- ``logPolicies``: Sets log policies for trials. For details, visit :ref:`log_policies
<experiment-config-min-validation-period>`.
Expand Down
4 changes: 2 additions & 2 deletions docs/reference/deploy/master-config-reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,12 @@ configure different container images for NVIDIA GPU tasks using the ``cuda`` key
Determined 0.17.6), CPU tasks using ``cpu`` key, and ROCm (AMD GPU) tasks using the ``rocm`` key.
Default values:

- ``determinedai/pytorch-ngc-dev:8c90e80`` for NVIDIA GPUs and for CPUs.
- ``determinedai/pytorch-ngc-dev:0e43056`` for NVIDIA GPUs and for CPUs.
- ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4`` for ROCm.

For TensorFlow users, we provide an image that must be referenced in the experiment configuration:

- ``determinedai/tensorflow-ngc-dev:8c90e80`` for NVIDIA GPUs and for CPUs.
- ``determinedai/tensorflow-ngc-dev:0e43056`` for NVIDIA GPUs and for CPUs.

``environment_variables``
=========================
Expand Down
4 changes: 2 additions & 2 deletions docs/reference/experiment-config-reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1333,12 +1333,12 @@ Optional. The Docker image to use when executing the workload. This image must b
container images for NVIDIA GPU tasks using ``cuda`` key (``gpu`` prior to 0.17.6), CPU tasks using
``cpu`` key, and ROCm (AMD GPU) tasks using ``rocm`` key. Default values:

- ``determinedai/pytorch-ngc-dev:8c90e80`` for NVIDIA GPUs and for CPUs.
- ``determinedai/pytorch-ngc-dev:0e43056`` for NVIDIA GPUs and for CPUs.
- ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4`` for ROCm.

For TensorFlow users, we provide an image that must be referenced in the experiment configuration:

- ``determinedai/tensorflow-ngc-dev:8c90e80`` for NVIDIA GPUs and for CPUs.
- ``determinedai/tensorflow-ngc-dev:0e43056`` for NVIDIA GPUs and for CPUs.

When the cluster is configured with :ref:`resource_manager.type: slurm
<cluster-configuration-slurm>` and ``container_run_type: singularity``, images are executed using
Expand Down
4 changes: 2 additions & 2 deletions docs/reference/job-config-reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,13 @@ The following configuration settings are supported:
different container images for NVIDIA GPU tasks using ``cuda`` key (``gpu`` prior to 0.17.6),
CPU tasks using ``cpu`` key, and ROCm (AMD GPU) tasks using ``rocm`` key. Default values:

- ``determinedai/pytorch-ngc-dev:8c90e80`` for NVIDIA GPUs and for CPUs.
- ``determinedai/pytorch-ngc-dev:0e43056`` for NVIDIA GPUs and for CPUs.
- ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4`` for ROCm.

For TensorFlow users, we provide an image that must be referenced in the experiment
configuration:

- ``determinedai/tensorflow-ngc-dev:8c90e80`` for NVIDIA GPUs and for CPUs.
- ``determinedai/tensorflow-ngc-dev:0e43056`` for NVIDIA GPUs and for CPUs.

- ``force_pull_image``: Forcibly pull the image from the Docker registry and bypass the Docker
cache. Defaults to ``false``.
Expand Down
4 changes: 3 additions & 1 deletion docs/release-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ This improvement facilitates better resource tracking and management within Kube

Configuration: Introduce a DCGM Helm chart and Prometheus configuration to the
``tools/observability`` directory. Additionally, two new dashboards, "API Monitoring" and "Resource
Utilization", have been added to improve observability and operational insight.
Utilization", have been added to improve observability and operational insight. Visit `Kubernetes
Observability <https://docs.determined.ai/latest/integrations/observability/_index.html>`__ for a
complete setup guide.

- WebUI: Allow users to create and manage configuration templates through the WebUI.
- Commands: Commands now support automatically executing a ``startup-hook.sh`` script if it is
Expand Down
6 changes: 6 additions & 0 deletions docs/release-notes/idle.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
:orphan:

**Bug Fixes**

- Notebooks: Fix an issue introduced in 0.30.0 where idle notebooks were not terminated as
expected.
8 changes: 8 additions & 0 deletions docs/setup-cluster/_index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@ Ideal for getting started with Determined.
- :ref:`install-using-homebrew`
- :ref:`install-using-wsl`

.. note::

**Initial Password**: When you deploy locally using ``det deploy local`` with ``master-up`` or
``cluster-up`` commands and no user accounts have been created yet, an initial password will be
automatically generated and shown to the user (with the option to change it) if neither
``security.initial_user_password`` in ``master.yaml`` nor the ``--initial-user-password`` CLI
flag is present.

******************
Determined Agent
******************
Expand Down
4 changes: 2 additions & 2 deletions docs/setup-cluster/deploy-cluster/slurm/singularity.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ by default in this version of Determined are described below.
- - Environment
- File Name
- - CPUs
- ``determinedai/pytorch-ngc-dev:8c90e80``
- ``determinedai/pytorch-ngc-dev:0e43056``
- - NVIDIA GPUs
- ``determinedai/pytorch-ngc-dev:8c90e80``
- ``determinedai/pytorch-ngc-dev:0e43056``
- - AMD GPUs
- ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512``

Expand Down
Loading

0 comments on commit f1ebbf8

Please sign in to comment.