diff --git a/examples/cuda_vector_add.cu b/examples/cuda_vector_add.cu index c0378d7..db39c67 100644 --- a/examples/cuda_vector_add.cu +++ b/examples/cuda_vector_add.cu @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -25,6 +26,7 @@ constexpr size_t number_repetitions = 20; constexpr size_t max_queue_length = 5; constexpr size_t number_executors = 1; constexpr size_t gpu_id = 0; +constexpr bool in_order_repetitions = true; static_assert(vector_size % entries_per_task == 0); @@ -35,6 +37,47 @@ __global__ void kernel_add(const float_t *input_a, const float_t *input_b, float } int hpx_main(int argc, char *argv[]) { + + /* try { */ + /* boost::program_options::options_description desc{"Options"}; */ + /* desc.add_options()("help", "Help screen")( */ + /* "elements_per_task", */ + /* boost::program_options::value(&array_size) */ + /* ->default_value(5000000), */ + /* "Size of the buffers")( */ + /* "tasks_per_repetition", */ + /* boost::program_options::value(&number_futures) */ + /* ->default_value(64), */ + /* "Sets the number of futures to be (potentially) executed in parallel")( */ + /* "number_repetitions", */ + /* boost::program_options::value(&passes)->default_value(200), */ + /* "Sets the number of repetitions")( */ + /* "outputfile", */ + /* boost::program_options::value(&filename)->default_value( */ + /* ""), */ + /* "Redirect stdout/stderr to this file"); */ + + /* boost::program_options::variables_map vm; */ + /* boost::program_options::parsed_options options = */ + /* parse_command_line(argc, argv, desc); */ + /* boost::program_options::store(options, vm); */ + /* boost::program_options::notify(vm); */ + + /* if (vm.count("help") == 0u) { */ + /* std::cout << "Running with parameters:" << std::endl */ + /* << " --arraysize = " << array_size << std::endl */ + /* << " --futures = " << number_futures << std::endl */ + /* << " --passes = " << passes << std::endl */ + /* << " --hpx:threads = " << hpx::get_os_thread_count() */ + /* << std::endl; */ + /* } else { */ + /* std::cout << desc << std::endl; */ + /* return hpx::finalize(); */ + /* } */ + /* } catch (const boost::program_options::error &ex) { */ + /* std::cerr << "CLI argument problem found: " << ex.what() << '\n'; */ + /* } */ + // HPX and CPPuddle Setup for executor (polling + pool init) // =========================================== 0.a Init HPX CUDA polling hpx::cout << "Start initializing CUDA polling and executor pool..." << std::endl; @@ -52,12 +95,15 @@ int hpx_main(int argc, char *argv[]) { // Launch tasks // Note: Repetitions may be out of order since they do not depend on each other in this toy sample hpx::cout << "Start launching tasks..." << std::endl; + + hpx::shared_future previous_iteration_fut = hpx::make_ready_future(); std::vector> repetition_futs(number_repetitions); for (size_t repetition = 0; repetition < number_repetitions; repetition++) { std::vector> futs(number_tasks); for (size_t task_id = 0; task_id < number_tasks; task_id++) { - futs[task_id] = hpx::async([task_id, &number_cpu_kernel_launches, - &number_gpu_kernel_launches]() { + auto gpu_task_lambda = [](const auto task_id, + auto &number_cpu_kernel_launches, + auto &number_gpu_kernel_launches) { // Inner Task Setup to launch the CUDA kernels: // =========================================== @@ -142,27 +188,63 @@ int hpx_main(int argc, char *argv[]) { // Inner Task Done! // =========================================== - }); + }; + if (in_order_repetitions) { + futs[task_id] = previous_iteration_fut.then([task_id, &number_cpu_kernel_launches, + &number_gpu_kernel_launches, gpu_task_lambda](auto && fut) { + gpu_task_lambda(task_id, number_cpu_kernel_launches, + number_gpu_kernel_launches); + }); + } else { + futs[task_id] = + hpx::async([task_id, &number_cpu_kernel_launches, + &number_gpu_kernel_launches, gpu_task_lambda]() { + gpu_task_lambda(task_id, number_cpu_kernel_launches, + number_gpu_kernel_launches); + }); + } } // Schedule output task to run once a repetition is done auto repetition_finished = hpx::when_all(futs); - repetition_futs.emplace_back(repetition_finished.then([repetition](auto &&fut) { + if (in_order_repetitions) { + previous_iteration_fut = + repetition_finished.then([repetition](auto &&fut) { + hpx::cout << "Repetition " << repetition << " done!" << std::endl; + }); + } else { + repetition_futs.emplace_back( + repetition_finished.then([repetition](auto &&fut) { hpx::cout << "Repetition " << repetition << " done!" << std::endl; })); + } } hpx::cout << "All tasks launched asynchronously!" << std::endl << std::endl; // Schedule output task to run once all other tasks are done - auto all_done_fut = - hpx::when_all(repetition_futs) - .then([&number_cpu_kernel_launches, - &number_gpu_kernel_launches](auto &&fut) { - hpx::cout << "All tasks are done!" << std::endl; - hpx::cout << " => " << number_gpu_kernel_launches - << " kernels were run on the GPU" << std::endl; - hpx::cout << " => " << number_cpu_kernel_launches - << " kernels were using the CPU fallback" << std::endl << std::endl; - }); - all_done_fut.get(); + if (in_order_repetitions) { + previous_iteration_fut + .then([&number_cpu_kernel_launches, + &number_gpu_kernel_launches](auto &&fut) { + hpx::cout << "All tasks are done! [in-order repetitions version]" << std::endl; + hpx::cout << " => " << number_gpu_kernel_launches + << " kernels were run on the GPU" << std::endl; + hpx::cout << " => " << number_cpu_kernel_launches + << " kernels were using the CPU fallback" << std::endl + << std::endl; + }) + .get(); + } else { + hpx::when_all(repetition_futs) + .then([&number_cpu_kernel_launches, + &number_gpu_kernel_launches](auto &&fut) { + hpx::cout << "All tasks are done! [out-of-order repetitions version]" << std::endl; + hpx::cout << " => " << number_gpu_kernel_launches + << " kernels were run on the GPU" << std::endl; + hpx::cout << " => " << number_cpu_kernel_launches + << " kernels were using the CPU fallback" << std::endl + << std::endl; + }) + .get(); + } hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0)); hpx::cout << "Finalizing..." << std::endl;