Skip to content

Commit

Permalink
Cherry pick (batch 2) to rel-1.5.1 (#5290)
Browse files Browse the repository at this point in the history
* remove implicit linking of tensorrt and dnnl ep shared libs (#5262)
* Update DirectML Nuget to 1.3.0 (#5274)
* Update PyTorch TransformerModel sample (#5275)
* Insert telemetry template into GPU build, add telemetry build switches. (#5278)
* Synchronize training dependency versions between Docker image and Python wheel (#5261)
* Downgrade GCC (#5269)
* Remove --enable_symbolic_shape_infer_tests to fix linux ci pipeline build error.

Co-authored-by: Edward Chen
Co-authored-by: George Wu <jywu@microsoft.com>
Co-authored-by: Dwayne Robinson <dwayner@microsoft.com>
Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
Co-authored-by: Dmitri Smirnov <yuslepukhin@users.noreply.github.com>
Co-authored-by: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Co-authored-by: Changming Sun <chasun@microsoft.com>
  • Loading branch information
7 people authored Sep 25, 2020
1 parent 389cca7 commit c00e13a
Show file tree
Hide file tree
Showing 57 changed files with 651 additions and 334 deletions.
11 changes: 6 additions & 5 deletions BUILD.md
Original file line number Diff line number Diff line change
Expand Up @@ -1103,12 +1103,13 @@ Dockerfile instructions are available [here](./dockerfiles#migraphx)

The default NVIDIA GPU build requires CUDA runtime libraries installed on the system:

* CUDA 10.2
* cuDNN 7.6.5
* NCCL v2.7.8
* OpenMPI 4.0.4
* [CUDA](https://developer.nvidia.com/cuda-toolkit) 10.2
* [cuDNN](https://developer.nvidia.com/cudnn) 8.0
* [NCCL](https://developer.nvidia.com/nccl) 2.7
* [OpenMPI](https://www.open-mpi.org/) 4.0.4
* See [install_openmpi.sh](./tools/ci_build/github/linux/docker/scripts/install_openmpi.sh)

The official dependency versions are specified in [Dockerfile.training](./dockerfiles/Dockerfile.training).
These dependency versions should reflect what is in [Dockerfile.training](./dockerfiles/Dockerfile.training).

## Build instructions

Expand Down
2 changes: 1 addition & 1 deletion cmake/external/dml.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config)
set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config)
get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE)
set(DML_PACKAGE_DIR ${PACKAGES_DIR}/DirectML.3.0.0)
set(DML_PACKAGE_DIR ${PACKAGES_DIR}/DirectML.1.3.0)
set(DML_SHARED_LIB DirectML.dll)

# Restore nuget packages, which will pull down the DirectML redist package
Expand Down
5 changes: 1 addition & 4 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -397,9 +397,7 @@ set(ONNXRUNTIME_TEST_LIBS
${ONNXRUNTIME_INTEROP_TEST_LIBS}
${onnxruntime_libs}
${PROVIDERS_CUDA}
# These providers are shared libraries now, so aren't linked this way anymore:
${PROVIDERS_DNNL}
${PROVIDERS_TENSORRT}
# TENSORRT and DNNL are explicitly linked at runtime
${PROVIDERS_MIGRAPHX}
${PROVIDERS_NGRAPH}
${PROVIDERS_OPENVINO}
Expand Down Expand Up @@ -433,7 +431,6 @@ if(onnxruntime_USE_TENSORRT)
list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/tensorrt/*)
list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_tensorrt)
list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_tensorrt onnxruntime_providers_shared)
list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_tensorrt)
endif()

if(onnxruntime_USE_NNAPI_BUILTIN)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ docker run --gpus all --rm \
-e "PackageName=$PackageName" \
-e "RunTestCsharp=$RunTestCsharp" \
-e "RunTestNative=$RunTestNative" \
onnxruntimeregistry.azurecr.io/internal/azureml/onnxruntimegpubuild:ch35 \
onnxruntimeregistry.azurecr.io/internal/azureml/onnxruntimecentosgpubuild:ch5h \
/bin/bash /onnxruntime_src/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh \
/home/onnxruntimedev/$NUGET_REPO_DIRNAME /onnxruntime_src /home/onnxruntimedev $CurrentOnnxRuntimeVersion
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,6 @@ docker run --rm \
-e "DisableMlOps=$DISABLEMLOPS" \
-e "RunTestCsharp=$RunTestCsharp" \
-e "RunTestNative=$RunTestNative" \
onnxruntimeregistry.azurecr.io/internal/azureml/onnxruntimecpubuild:ch36 \
onnxruntimeregistry.azurecr.io/internal/azureml/onnxruntimecentoscpubuild:ch5g \
/bin/bash /onnxruntime_src/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh \
/home/onnxruntimedev/$NUGET_REPO_DIRNAME /onnxruntime_src /home/onnxruntimedev $CurrentOnnxRuntimeVersion
4 changes: 2 additions & 2 deletions dockerfiles/Dockerfile.training
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ ARG OPENMPI_PATH=/opt/openmpi-${OPENMPI_VERSION}
ARG COMMIT=master

# cuda development image for building sources
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 as builder
FROM nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04 as builder

# set location for builds
WORKDIR /stage
Expand Down Expand Up @@ -155,7 +155,7 @@ RUN pip install azureml-defaults transformers==2.11.0 msgpack==1.0.0 tensorboard

# switch to cuda runtime environment
# note: launch with --gpus all or nvidia-docker
FROM nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04
FROM nvidia/cuda:10.2-cudnn8-runtime-ubuntu18.04
WORKDIR /stage

# install ucx
Expand Down
2 changes: 1 addition & 1 deletion docs/execution_providers/DirectML-ExecutionProvider.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ When used standalone, the DirectML API is a low-level DirectX 12 library and is

The *DirectML Execution Provider* is an optional component of ONNX Runtime that uses DirectML to accelerate inference of ONNX models. The DirectML execution provider is capable of greatly improving evaluation time of models using commodity GPU hardware, without sacrificing broad hardware support or requiring vendor-specific extensions to be installed.

The DirectML Execution Provider currently uses DirectML version 2.1.0.
The DirectML Execution Provider currently uses DirectML version 1.3.0.

## Table of contents

Expand Down
54 changes: 52 additions & 2 deletions onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,57 @@
"^test_operator_mm",
"^test_operator_non_float_params",
"^test_operator_params",
"^test_operator_pow"
"^test_operator_pow",
"^test_nllloss_NC",
"^test_nllloss_NCd1",
"^test_nllloss_NCd1d2",
"^test_nllloss_NCd1d2d3d4d5_none_no_weight",
"^test_nllloss_NCd1d2d3d4d5_none_no_weight_expanded",
"^test_nllloss_NCd1d2d3_none_no_weight_negative_ii",
"^test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded",
"^test_nllloss_NCd1d2d3_sum_weight_high_ii",
"^test_nllloss_NCd1d2d3_sum_weight_high_ii_expanded",
"^test_nllloss_NCd1d2_expanded",
"^test_nllloss_NCd1d2_reduction_mean",
"^test_nllloss_NCd1d2_reduction_mean_expanded",
"^test_nllloss_NCd1d2_reduction_sum",
"^test_nllloss_NCd1d2_reduction_sum_expanded",
"^test_nllloss_NCd1d2_with_weight_reduction_sum_ii",
"^test_nllloss_NCd1d2_with_weight_reduction_sum_ii_expanded",
"^test_nllloss_NCd1_expanded",
"^test_nllloss_NC_expanded",
"^test_sce_mean_3d",
"^test_sce_mean_3d_expanded",
"^test_sce_mean_3d_log_prob",
"^test_sce_mean_3d_log_prob_expanded",
"^test_sce_mean",
"^test_sce_mean_expanded",
"^test_sce_mean_log_prob",
"^test_sce_mean_log_prob_expanded",
"^test_sce_NCd1d2d3d4d5_mean_weight",
"^test_sce_NCd1d2d3d4d5_mean_weight_expanded",
"^test_sce_NCd1d2d3d4d5_mean_weight_log_prob",
"^test_sce_NCd1d2d3d4d5_mean_weight_log_prob_expanded",
"^test_sce_NCd1d2d3d4d5_none_no_weight",
"^test_sce_NCd1d2d3d4d5_none_no_weight_expanded",
"^test_sce_NCd1d2d3d4d5_none_no_weight_log_prob",
"^test_sce_NCd1d2d3d4d5_none_no_weight_log_prob_expanded",
"^test_sce_NCd1d2d3_none_no_weight_negative_ii",
"^test_sce_NCd1d2d3_none_no_weight_negative_ii_expanded",
"^test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob",
"^test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob_expanded",
"^test_sce_NCd1d2d3_sum_weight_high_ii",
"^test_sce_NCd1d2d3_sum_weight_high_ii_expanded",
"^test_sce_NCd1d2d3_sum_weight_high_ii_log_prob",
"^test_sce_NCd1d2d3_sum_weight_high_ii_log_prob_expanded",
"^test_sce_none",
"^test_sce_none_expanded",
"^test_sce_none_log_prob",
"^test_sce_none_log_prob_expanded",
"^test_sce_sum",
"^test_sce_sum_expanded",
"^test_sce_sum_log_prob",
"^test_sce_sum_log_prob_expanded"
],
"unsupported_usages": [
"^test_convtranspose_1d", // ConvTransponse supports 4-D only
Expand All @@ -178,4 +228,4 @@
"^test_bitshift_right_uint16",
"^test_bitshift_left_uint16"
]
}
}
3 changes: 2 additions & 1 deletion orttraining/orttraining/test/gradient/gradient_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,8 @@ TEST(GradientCheckerTest, SubGrad) {
TestBroadcastableBinaryOpGrad("Sub");
}

TEST(GradientCheckerTest, MulGrad) {
//flaky
TEST(GradientCheckerTest, DISABLED_MulGrad) {
TestBroadcastableBinaryOpGrad("Mul");
}

Expand Down
2 changes: 1 addition & 1 deletion packages.config
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="DirectML" version="3.0.0" targetFramework="native" />
<package id="DirectML" version="1.3.0" targetFramework="native" />
<package id="GoogleTestAdapter" version="0.17.1" targetFramework="net46" />
</packages>
16 changes: 14 additions & 2 deletions samples/python/pytorch_transformer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,24 @@ This example was adapted from Pytorch's [Sequence-to-Sequence Modeling with nn.T

## Running PyTorch version

```python
```bash
python pt_train.py
```

## Running ONNX Runtime version

```python
```bash
python ort_train.py
```

## Optional arguments

| Argument | Description | Default |
| :---------------- | :-----------------------------------------------------: | --------: |
| --batch-size | input batch size for training | 20 |
| --test-batch-size | input batch size for testing | 20 |
| --epochs | number of epochs to train | 2 |
| --lr | learning rate | 0.001 |
| --no-cuda | disables CUDA training | False |
| --seed | random seed | 1 |
| --log-interval | how many batches to wait before logging training status | 200 |
85 changes: 85 additions & 0 deletions samples/python/pytorch_transformer/ort_train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import argparse
import math
import torch
import onnxruntime

from utils import prepare_data, get_batch
from ort_utils import my_loss, transformer_model_description_dynamic_axes
from pt_model import TransformerModel


def train(trainer, data_source, device, epoch, args, bptt=35):
    """Run one training epoch through an ONNX Runtime trainer.

    Walks ``data_source`` in strides of ``bptt``, feeds every batch returned
    by ``get_batch`` to ``trainer.train_step`` and, every
    ``args.log_interval`` batches (except batch 0), prints the loss summed
    since the previous report divided by ``args.log_interval``.

    Args:
        trainer: object exposing ``train_step(data, targets)`` that returns a
            ``(loss, pred)`` pair; ``loss`` must support ``.item()``.
        data_source: training data; must support ``size(0)`` and ``len()``.
        device: not used by this function; kept so the signature is stable.
        epoch: epoch number, only used in the progress line.
        args: parsed command-line options; only ``log_interval`` is read.
        bptt: stride between consecutive batch start offsets.
    """
    running_loss = 0.
    stop = data_source.size(0) - 1
    for batch, i in enumerate(range(0, stop, bptt)):
        data, targets = get_batch(data_source, i)

        # The prediction is returned by the trainer but not needed here.
        loss, pred = trainer.train_step(data, targets)
        running_loss += loss.item()

        # Only report on the logging cadence, and never for the first batch.
        if batch % args.log_interval != 0 or batch <= 0:
            continue

        cur_loss = running_loss / args.log_interval
        num_batches = len(data_source) // bptt
        print('epoch {:3d} | {:5d}/{:5d} batches | loss {:5.2f}'.format(
            epoch, batch, num_batches, cur_loss))
        running_loss = 0


def evaluate(trainer, data_source, bptt=35):
    """Return the length-weighted average evaluation loss over ``data_source``.

    Each batch's loss from ``trainer.eval_step`` is weighted by ``len(data)``
    so that a shorter final batch contributes proportionally; the weighted sum
    is divided by ``len(data_source) - 1``.

    Args:
        trainer: object exposing ``eval_step(data, targets)`` that returns a
            ``(loss, pred)`` pair; ``loss`` must support ``.item()``.
        data_source: evaluation data; must support ``size(0)`` and ``len()``.
        bptt: stride between consecutive batch start offsets.

    Returns:
        The weighted loss sum divided by ``len(data_source) - 1``.
    """
    weighted_loss = 0.
    with torch.no_grad():
        for start in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, start)
            loss, pred = trainer.eval_step(data, targets)
            weighted_loss += len(data) * loss.item()
    denominator = len(data_source) - 1
    return weighted_loss / denominator


if __name__ == "__main__":
    # ---- Command-line options ------------------------------------------
    parser = argparse.ArgumentParser(description='PyTorch TransformerModel example')
    parser.add_argument(
        '--batch-size', type=int, default=20, metavar='N',
        help='input batch size for training (default: 20)')
    parser.add_argument(
        '--test-batch-size', type=int, default=20, metavar='N',
        help='input batch size for testing (default: 20)')
    parser.add_argument(
        '--epochs', type=int, default=2, metavar='N',
        help='number of epochs to train (default: 2)')
    parser.add_argument(
        '--lr', type=float, default=0.001, metavar='LR',
        help='learning rate (default: 0.001)')
    parser.add_argument(
        '--no-cuda', action='store_true', default=False,
        help='disables CUDA training')
    parser.add_argument(
        '--seed', type=int, default=1, metavar='S',
        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval', type=int, default=200, metavar='N',
        help='how many batches to wait before logging training status (default: 200)')
    args = parser.parse_args()

    # ---- Device selection and seeding ----------------------------------
    # CUDA is used only when it was not disabled and is actually available.
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    torch.manual_seed(args.seed)
    onnxruntime.set_seed(args.seed)

    # ---- Model, optimizer configuration and model description ----------
    optim_config = onnxruntime.training.optim.SGDConfig(lr=args.lr)
    model_desc = transformer_model_description_dynamic_axes()
    model = TransformerModel(28785, 200, 2, 200, 2, 0.2).to(device)

    # ---- Data and trainer ----------------------------------------------
    train_data, val_data, test_data = prepare_data(device, args.batch_size, args.test_batch_size)
    trainer = onnxruntime.training.ORTTrainer(model, model_desc, optim_config, loss_fn=my_loss)

    # ---- Training loop: one validation pass after every epoch ----------
    epoch_rule = '-' * 89
    for epoch in range(1, args.epochs + 1):
        train(trainer, train_data, device, epoch, args)
        val_loss = evaluate(trainer, val_data)
        print(epoch_rule)
        print('| end of epoch {:3d} | valid loss {:5.2f} | '.format(epoch, val_loss))
        print(epoch_rule)

    # ---- Final evaluation on the held-out test split -------------------
    final_rule = '=' * 89
    test_loss = evaluate(trainer, test_data)
    print(final_rule)
    print('| End of training | test loss {:5.2f}'.format(test_loss))
    print(final_rule)
92 changes: 92 additions & 0 deletions samples/python/pytorch_transformer/pt_train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import argparse
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

from utils import prepare_data, get_batch
from pt_model import TransformerModel


def train(model, data_source, device, epoch, args, bptt=35, criterion=None, optimizer=None):
    """Run one training epoch of ``model`` with plain PyTorch.

    Walks ``data_source`` in strides of ``bptt``; for every batch returned by
    ``get_batch`` it zeroes the gradients, runs the forward pass, computes the
    loss, back-propagates and steps the optimizer. Every
    ``args.log_interval`` batches (except batch 0) it prints the loss summed
    since the previous report divided by ``args.log_interval``.

    Args:
        model: the module to train; it is switched to training mode.
        data_source: training data; must support ``size(0)`` and ``len()``.
        device: not used by this function; kept so the signature is stable.
        epoch: epoch number, only used in the progress line.
        args: parsed command-line options; only ``log_interval`` is read.
        bptt: stride between consecutive batch start offsets.
        criterion: loss callable ``criterion(output, targets)``. When omitted,
            the module-level ``criterion`` is used, which is how the script
            entry point has always supplied it.
        optimizer: optimizer exposing ``zero_grad()`` and ``step()``. When
            omitted, the module-level ``optimizer`` is used.

    Raises:
        NameError: if a criterion or optimizer is neither passed in nor
            defined at module level.
    """
    # Earlier versions read ``criterion`` and ``optimizer`` as implicit
    # globals, which only exist when the file runs as a script. Accept them
    # explicitly, but keep the old lookup as a fallback so existing callers
    # keep working.
    module_scope = globals()
    if criterion is None:
        criterion = module_scope.get('criterion')
    if optimizer is None:
        optimizer = module_scope.get('optimizer')
    if criterion is None or optimizer is None:
        raise NameError(
            "train() needs a criterion and an optimizer: pass them explicitly "
            "or define module-level 'criterion' and 'optimizer'")

    total_loss = 0.
    model.train()
    for batch, i in enumerate(range(0, data_source.size(0) - 1, bptt)):
        data, targets = get_batch(data_source, i)

        optimizer.zero_grad()
        output = model(data)
        # 28785 matches the first TransformerModel constructor argument used
        # by this script (presumably the vocabulary size; confirm in pt_model).
        loss = criterion(output.view(-1, 28785), targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            print('epoch {:3d} | {:5d}/{:5d} batches | loss {:5.2f}'.format(
                epoch, batch, len(data_source) // bptt, cur_loss))
            total_loss = 0


def evaluate(model, data_source, criterion, bptt=35):
    """Return the length-weighted average loss of ``model`` over ``data_source``.

    The model is switched to evaluation mode and run under
    ``torch.no_grad()``. Each batch's loss is weighted by ``len(data)`` so
    that a shorter final batch contributes proportionally; the weighted sum is
    divided by ``len(data_source) - 1``.

    Args:
        model: the module to evaluate.
        data_source: evaluation data; must support ``size(0)`` and ``len()``.
        criterion: loss callable ``criterion(output, targets)`` whose result
            supports ``.item()``.
        bptt: stride between consecutive batch start offsets.

    Returns:
        The weighted loss sum divided by ``len(data_source) - 1``.
    """
    model.eval()
    weighted_loss = 0.
    with torch.no_grad():
        for start in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, start)
            # Flatten to rows of width 28785 before applying the criterion.
            flat_output = model(data).view(-1, 28785)
            batch_loss = criterion(flat_output, targets).item()
            weighted_loss += len(data) * batch_loss
    return weighted_loss / (len(data_source) - 1)


if __name__ == "__main__":
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch TransformerModel example')
    parser.add_argument('--batch-size', type=int, default=20, metavar='N',
                        help='input batch size for training (default: 20)')
    parser.add_argument('--test-batch-size', type=int, default=20, metavar='N',
                        help='input batch size for testing (default: 20)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                        help='how many batches to wait before logging training status (default: 200)')

    # Basic setup: use CUDA only when not disabled and actually available.
    args = parser.parse_args()
    if not args.no_cuda and torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    torch.manual_seed(args.seed)

    # Model
    # ``criterion`` and ``optimizer`` are module-level names on purpose:
    # train() falls back to them when they are not passed explicitly.
    criterion = nn.CrossEntropyLoss()
    # Honor the --lr option; it was previously parsed but overridden by a
    # hard-coded 0.001 (which remains the default value of the flag).
    lr = args.lr
    model = TransformerModel(28785, 200, 2, 200, 2, 0.2).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    # Preparing data
    train_data, val_data, test_data = prepare_data(device, args.batch_size, args.test_batch_size)

    # Train, validating after every epoch
    for epoch in range(1, args.epochs + 1):
        train(model, train_data, device, epoch, args)
        val_loss = evaluate(model, val_data, criterion)
        print('-' * 89)
        print('| end of epoch {:3d} | valid loss {:5.2f} | '.format(epoch, val_loss))
        print('-' * 89)

    # Evaluate on the held-out test split
    test_loss = evaluate(model, test_data, criterion)
    print('=' * 89)
    print('| End of training | test loss {:5.2f}'.format(test_loss))
    print('=' * 89)
Loading

0 comments on commit c00e13a

Please sign in to comment.