NVIDIA · mzient · Jan 29, 2024 · Dec 8, 2023 · Dec 13, 2023 · Dec 15, 2023
diff --git a/dali/benchmark/caffe2_alexnet_bench.cc b/dali/benchmark/caffe2_alexnet_bench.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -105,20 +105,17 @@ BENCHMARK_DEFINE_F(C2Alexnet, Caffe2Pipe)(benchmark::State& st) { // NOLINT
 
  // Run once to allocate the memory
  Workspace ws;
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  while (st.KeepRunning()) {
  if (st.iterations() == 1 && pipelined) {
  // We will start he processing for the next batch
  // immediately after issueing work to the gpu to
  // pipeline the cpu/copy/gpu work
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  }
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  if (st.iterations() == st.max_iterations && pipelined) {
@@ -236,20 +233,17 @@ BENCHMARK_DEFINE_F(C2Alexnet, HybridPipe)(benchmark::State& st) { // NOLINT
 
  // Run once to allocate the memory
  Workspace ws;
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  while (st.KeepRunning()) {
  if (st.iterations() == 1 && pipelined) {
  // We will start he processing for the next batch
  // immediately after issueing work to the gpu to
  // pipeline the cpu/copy/gpu work
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  }
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  if (st.iterations() == st.max_iterations && pipelined) {

diff --git a/dali/benchmark/caffe_alexnet_bench.cc b/dali/benchmark/caffe_alexnet_bench.cc
@@ -107,20 +107,17 @@ BENCHMARK_DEFINE_F(Alexnet, CaffePipe)(benchmark::State& st) { // NOLINT
 
  // Run once to allocate the memory
  Workspace ws;
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  while (st.KeepRunning()) {
  if (st.iterations() == 1 && pipelined) {
  // We will start he processing for the next batch
  // immediately after issueing work to the gpu to
  // pipeline the cpu/copy/gpu work
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  }
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  if (st.iterations() == st.max_iterations && pipelined) {
@@ -238,20 +235,17 @@ BENCHMARK_DEFINE_F(Alexnet, HybridPipe)(benchmark::State& st) { // NOLINT
 
  // Run once to allocate the memory
  Workspace ws;
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  while (st.KeepRunning()) {
  if (st.iterations() == 1 && pipelined) {
  // We will start he processing for the next batch
  // immediately after issueing work to the gpu to
  // pipeline the cpu/copy/gpu work
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  }
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  if (st.iterations() == st.max_iterations && pipelined) {

diff --git a/dali/benchmark/checkpointing_bench.cc b/dali/benchmark/checkpointing_bench.cc
@@ -40,13 +40,11 @@ class CheckpointingOverhead : public DALIBenchmark {
  Workspace ws;
 
  // Warmup
- pipe->RunCPU();
- pipe->RunGPU();
+ pipe->Run();
  pipe->Outputs(&ws);
 
  while (st.KeepRunning()) {
- pipe->RunCPU();
- pipe->RunGPU();
+ pipe->Run();
  pipe->Outputs(&ws);
  if (policy == CheckpointingPolicy::SaveEveryIter) {
  volatile auto cpt = pipe->GetCheckpoint();

diff --git a/dali/benchmark/decoder_bench.cc b/dali/benchmark/decoder_bench.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -56,8 +56,7 @@ class DecoderBench : public DALIBenchmark {
  // Run once to allocate the memory
  Workspace ws;
  pipe.SetExternalInput("raw_jpegs", data);
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  while (st.KeepRunning()) {
@@ -66,13 +65,11 @@ class DecoderBench : public DALIBenchmark {
  // immediately after issueing work to the gpu to
  // pipeline the cpu/copy/gpu work
  pipe.SetExternalInput("raw_jpegs", data);
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  }
 
  pipe.SetExternalInput("raw_jpegs", data);
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  if (st.iterations() == st.max_iterations) {

diff --git a/dali/benchmark/file_reader_alexnet_bench.cc b/dali/benchmark/file_reader_alexnet_bench.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -106,20 +106,17 @@ BENCHMARK_DEFINE_F(FileReaderAlexnet, CaffePipe)(benchmark::State& st) { // NOLI
 
  // Run once to allocate the memory
  Workspace ws;
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  while (st.KeepRunning()) {
  if (st.iterations() == 1 && pipelined) {
  // We will start he processing for the next batch
  // immediately after issueing work to the gpu to
  // pipeline the cpu/copy/gpu work
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  }
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  if (st.iterations() == st.max_iterations && pipelined) {

diff --git a/dali/benchmark/file_reader_fast_forward_bench.cc b/dali/benchmark/file_reader_fast_forward_bench.cc
@@ -70,8 +70,7 @@ BENCHMARK_DEFINE_F(FileReaderFastForward, FastForward)(benchmark::State& st) { /
 
  Workspace ws;
  for (int i = 0; i < snapshot_at; i++) {
- pipe->RunCPU();
- pipe->RunGPU();
+ pipe->Run();
  pipe->Outputs(&ws);
  }
 
@@ -85,8 +84,7 @@ BENCHMARK_DEFINE_F(FileReaderFastForward, FastForward)(benchmark::State& st) { /
  pipe2->RestoreFromCheckpoint(cpt);
 
  st.PauseTiming();
- pipe2->RunCPU();
- pipe2->RunGPU();
+ pipe2->Run();
  pipe2->Outputs(&ws);
  st.ResumeTiming();
  }

diff --git a/dali/benchmark/resnet50_bench.cc b/dali/benchmark/resnet50_bench.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -107,8 +107,7 @@ BENCHMARK_DEFINE_F(RN50, C2Pipe)(benchmark::State& st) { // NOLINT
 
  // Run once to allocate the memory
  Workspace ws;
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  while (st.KeepRunning()) {
@@ -118,11 +117,9 @@ BENCHMARK_DEFINE_F(RN50, C2Pipe)(benchmark::State& st) { // NOLINT
  // immediately after issueing work to the gpu to
  // pipeline the cpu/copy/gpu work
  pipe.SetExternalInput("raw_jpegs", data);
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  }
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  if (st.iterations() == st.max_iterations && pipelined) {
@@ -242,8 +239,7 @@ BENCHMARK_DEFINE_F(RN50, HybridPipe)(benchmark::State& st) { // NOLINT
 
  // Run once to allocate the memory
  Workspace ws;
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  while (st.KeepRunning()) {
@@ -253,11 +249,9 @@ BENCHMARK_DEFINE_F(RN50, HybridPipe)(benchmark::State& st) { // NOLINT
  // immediately after issueing work to the gpu to
  // pipeline the cpu/copy/gpu work
  pipe.SetExternalInput("raw_jpegs", data);
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  }
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  if (st.iterations() == st.max_iterations && pipelined) {
@@ -355,8 +349,7 @@ BENCHMARK_DEFINE_F(RN50, nvJPEGPipe)(benchmark::State& st) { // NOLINT
 
  // Run once to allocate the memory
  Workspace ws;
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  while (st.KeepRunning()) {
@@ -366,11 +359,9 @@ BENCHMARK_DEFINE_F(RN50, nvJPEGPipe)(benchmark::State& st) { // NOLINT
  // immediately after issueing work to the gpu to
  // pipeline the cpu/copy/gpu work
  pipe.SetExternalInput("raw_jpegs", data);
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  }
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  if (st.iterations() == st.max_iterations && pipelined) {

diff --git a/dali/benchmark/resnet50_nvjpeg_bench.cc b/dali/benchmark/resnet50_nvjpeg_bench.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -89,20 +89,17 @@ BENCHMARK_DEFINE_F(RealRN50, nvjpegPipe)(benchmark::State& st) { // NOLINT
 
  // Run once to allocate the memory
  Workspace ws;
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  while (st.KeepRunning()) {
  if (st.iterations() == 1 && pipelined) {
  // We will start he processing for the next batch
  // immediately after issueing work to the gpu to
  // pipeline the cpu/copy/gpu work
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  }
- pipe.RunCPU();
- pipe.RunGPU();
+ pipe.Run();
  pipe.Outputs(&ws);
 
  if (st.iterations() == st.max_iterations && pipelined) {

diff --git a/dali/c_api/c_api.cc b/dali/c_api/c_api.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -251,6 +251,7 @@ daliCreatePipeline2(daliPipelineHandle *pipe_handle, const char *serialized_pipe
  bool se = separated_execution != 0;
  bool pe = pipelined_execution != 0;
  bool ae = async_execution != 0;
+
  auto pipeline =
  std::make_unique<dali::Pipeline>(std::string(serialized_pipeline, length), max_batch_size,
  num_threads, device_id, pe, prefetch_queue_depth, ae);
@@ -283,26 +284,36 @@ int daliGetMaxBatchSize(daliPipelineHandle_t pipe_handle) {
  return (*pipe_handle)->pipeline->max_batch_size();
 }
 
+int daliInputFeedCount(daliPipelineHandle_t pipe_handle, const char *input_name) {  // new C API entry point added by this patch
+ auto &pipeline = (*pipe_handle)->pipeline;  // pipe_handle is dereferenced without a null check
+ return pipeline->InputFeedCount(input_name);  // returns the pipeline's InputFeedCount() result for `input_name` unchanged
+}
+
+void daliPrefetch(daliPipelineHandle_t pipe_handle) {  // new C API entry point; takes no queue-depth argument
+ auto &pipeline = (*pipe_handle)->pipeline;  // pipe_handle is dereferenced without a null check
+ pipeline->Prefetch();  // the whole job is delegated to the pipeline's Prefetch()
+}
 
 void daliPrefetchUniform(daliPipelineHandle_t pipe_handle, int queue_depth) {
  auto &pipeline = (*pipe_handle)->pipeline;
- for (int i = 0; i < queue_depth; ++i) {
- pipeline->RunCPU();
- pipeline->RunGPU();
+ auto sz = pipeline->GetQueueSizes();  // queue sizes as reported by the pipeline itself
+ if (queue_depth != sz.cpu_size || queue_depth != sz.gpu_size) {  // queue_depth is only compared here; it no longer drives a loop
+ DALI_WARN("daliPrefetchUniform is deprecated and setting queue_length different than"
+ " the one set in the pipeline has no effect. Use daliPrefetch instead.");
  }
+ pipeline->Prefetch();  // replaces the removed RunCPU()/RunGPU() loop; called whether or not the warning fired
 }
 
 
 void daliPrefetchSeparate(daliPipelineHandle_t pipe_handle,
  int cpu_queue_depth, int gpu_queue_depth) {
  auto &pipeline = (*pipe_handle)->pipeline;
- for (int i = 0; i < gpu_queue_depth; ++i) {
- pipeline->RunCPU();
- pipeline->RunGPU();
- }
- for (int i = 0; i < cpu_queue_depth; ++i) {
- pipeline->RunCPU();
+ auto sz = pipeline->GetQueueSizes();  // queue sizes as reported by the pipeline itself
+ if (cpu_queue_depth != sz.cpu_size || gpu_queue_depth != sz.gpu_size) {  // both depth arguments are only compared, never used to issue work
+ DALI_WARN("daliPrefetchSeparate is deprecated and setting queue_length different than"
+ " the one set in the pipeline has no effect. Use daliPrefetch instead.");
  }
+ pipeline->Prefetch();  // replaces both removed loops; called whether or not the warning fired
 }
 
 
@@ -402,8 +413,7 @@ dali_data_type_t daliGetExternalInputType(daliPipelineHandle_t pipe_handle, cons
 
 void daliRun(daliPipelineHandle_t pipe_handle) {
  dali::Pipeline *pipeline = (*pipe_handle)->pipeline.get();
- pipeline->RunCPU();
- pipeline->RunGPU();
+ pipeline->Run();  // single call in place of the removed RunCPU() + RunGPU() pair
 }