diff --git a/examples/cuda_vector_add.cu b/examples/cuda_vector_add.cu
index c0378d7..db39c67 100644
--- a/examples/cuda_vector_add.cu
+++ b/examples/cuda_vector_add.cu
@@ -5,6 +5,7 @@
 #include <hpx/include/iostreams.hpp>
 #include <hpx/async_cuda/cuda_executor.hpp>
 
+#include <boost/program_options.hpp>
 
 #include <cppuddle/memory_recycling/std_recycling_allocators.hpp>
 #include <cppuddle/memory_recycling/cuda_recycling_allocators.hpp>
@@ -25,6 +26,7 @@ constexpr size_t number_repetitions = 20;
 constexpr size_t max_queue_length = 5;
 constexpr size_t number_executors = 1;
 constexpr size_t gpu_id = 0;
+constexpr bool in_order_repetitions = true;
 
 static_assert(vector_size % entries_per_task == 0);
 
@@ -35,6 +37,47 @@ __global__ void kernel_add(const float_t *input_a, const float_t *input_b, float
 }
 
 int hpx_main(int argc, char *argv[]) {
+
+  /* try { */
+  /*   boost::program_options::options_description desc{"Options"}; */
+  /*   desc.add_options()("help", "Help screen")( */
+  /*       "elements_per_task", */
+  /*       boost::program_options::value<size_t>(&array_size) */
+  /*           ->default_value(5000000), */
+  /*       "Size of the buffers")( */
+  /*       "tasks_per_repetition", */
+  /*       boost::program_options::value<size_t>(&number_futures) */
+  /*           ->default_value(64), */
+  /*       "Sets the number of futures to be (potentially) executed in parallel")( */
+  /*       "number_repetitions", */
+  /*       boost::program_options::value<size_t>(&passes)->default_value(200), */
+  /*       "Sets the number of repetitions")( */
+  /*       "outputfile", */
+  /*       boost::program_options::value<std::string>(&filename)->default_value( */
+  /*           ""), */
+  /*       "Redirect stdout/stderr to this file"); */
+
+  /*   boost::program_options::variables_map vm; */
+  /*   boost::program_options::parsed_options options = */
+  /*       parse_command_line(argc, argv, desc); */
+  /*   boost::program_options::store(options, vm); */
+  /*   boost::program_options::notify(vm); */
+
+  /*   if (vm.count("help") == 0u) { */
+  /*     std::cout << "Running with parameters:" << std::endl */
+  /*               << " --arraysize = " << array_size << std::endl */
+  /*               << " --futures =  " << number_futures << std::endl */
+  /*               << " --passes = " << passes << std::endl */
+  /*               << " --hpx:threads = " << hpx::get_os_thread_count() */
+  /*               << std::endl; */
+  /*   } else { */
+  /*     std::cout << desc << std::endl; */
+  /*     return hpx::finalize(); */
+  /*   } */
+  /* } catch (const boost::program_options::error &ex) { */
+  /*   std::cerr << "CLI argument problem found: " << ex.what() << '\n'; */
+  /* } */
+
   // HPX and CPPuddle Setup for executor (polling + pool init)
   // =========================================== 0.a Init HPX CUDA polling
   hpx::cout << "Start initializing CUDA polling and executor pool..." << std::endl;
@@ -52,12 +95,15 @@ int hpx_main(int argc, char *argv[]) {
   // Launch tasks 
   // Note: Repetitions may be out of order since they do not depend on each other in this toy sample
   hpx::cout << "Start launching tasks..." << std::endl;
+
+  hpx::shared_future<void> previous_iteration_fut = hpx::make_ready_future<void>();
   std::vector<hpx::future<void>> repetition_futs(number_repetitions);
   for (size_t repetition = 0; repetition < number_repetitions; repetition++) {
     std::vector<hpx::future<void>> futs(number_tasks);
     for (size_t task_id = 0; task_id < number_tasks; task_id++) {
-      futs[task_id] = hpx::async([task_id, &number_cpu_kernel_launches,
-                                  &number_gpu_kernel_launches]() {
+      auto gpu_task_lambda = [](const auto task_id,
+                                auto &number_cpu_kernel_launches,
+                                auto &number_gpu_kernel_launches) {
         // Inner Task Setup to launch the CUDA kernels:
         // ===========================================
 
@@ -142,27 +188,63 @@ int hpx_main(int argc, char *argv[]) {
 
         // Inner Task Done!
         // ===========================================
-      });
+      };
+      if (in_order_repetitions) {
+        futs[task_id] =  previous_iteration_fut.then([task_id, &number_cpu_kernel_launches,
+                        &number_gpu_kernel_launches, gpu_task_lambda](auto && fut) {
+              gpu_task_lambda(task_id, number_cpu_kernel_launches,
+                              number_gpu_kernel_launches);
+            });
+      } else {
+        futs[task_id] =
+            hpx::async([task_id, &number_cpu_kernel_launches,
+                        &number_gpu_kernel_launches, gpu_task_lambda]() {
+              gpu_task_lambda(task_id, number_cpu_kernel_launches,
+                              number_gpu_kernel_launches);
+            });
+      }
     }
     // Schedule output task to run once a repetition is done
     auto repetition_finished = hpx::when_all(futs);
-    repetition_futs.emplace_back(repetition_finished.then([repetition](auto &&fut) {
+    if (in_order_repetitions) {
+      previous_iteration_fut =
+          repetition_finished.then([repetition](auto &&fut) {
+            hpx::cout << "Repetition " << repetition << " done!" << std::endl;
+          });
+    } else {
+      repetition_futs.emplace_back(
+          repetition_finished.then([repetition](auto &&fut) {
             hpx::cout << "Repetition " << repetition << " done!" << std::endl;
           }));
+    }
   }
   hpx::cout << "All tasks launched asynchronously!" << std::endl << std::endl;
   // Schedule output task to run once all other tasks are done
-  auto all_done_fut =
-      hpx::when_all(repetition_futs)
-          .then([&number_cpu_kernel_launches,
-                 &number_gpu_kernel_launches](auto &&fut) {
-            hpx::cout << "All tasks are done!" << std::endl;
-            hpx::cout << " => " << number_gpu_kernel_launches
-                      << " kernels were run on the GPU" << std::endl;
-            hpx::cout << " => " << number_cpu_kernel_launches
-                      << " kernels were using the CPU fallback" << std::endl << std::endl;
-          });
-  all_done_fut.get();
+  if (in_order_repetitions) {
+    previous_iteration_fut
+        .then([&number_cpu_kernel_launches,
+               &number_gpu_kernel_launches](auto &&fut) {
+          hpx::cout << "All tasks are done! [in-order repetitions version]" << std::endl;
+          hpx::cout << " => " << number_gpu_kernel_launches
+                    << " kernels were run on the GPU" << std::endl;
+          hpx::cout << " => " << number_cpu_kernel_launches
+                    << " kernels were using the CPU fallback" << std::endl
+                    << std::endl;
+        })
+        .get();
+  } else {
+    hpx::when_all(repetition_futs)
+        .then([&number_cpu_kernel_launches,
+               &number_gpu_kernel_launches](auto &&fut) {
+          hpx::cout << "All tasks are done! [out-of-order repetitions version]" << std::endl;
+          hpx::cout << " => " << number_gpu_kernel_launches
+                    << " kernels were run on the GPU" << std::endl;
+          hpx::cout << " => " << number_cpu_kernel_launches
+                    << " kernels were using the CPU fallback" << std::endl
+                    << std::endl;
+        })
+        .get();
+  }
 
   hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0));
   hpx::cout << "Finalizing..." << std::endl;