-
Notifications
You must be signed in to change notification settings - Fork 27
/
multigpu_saxpy.cu
100 lines (72 loc) · 3.24 KB
/
multigpu_saxpy.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#include <cassert>
#include <iostream>
#include <chrono>
#include <agency/agency.hpp>
#include <agency/cuda.hpp>
#include <agency/cuda/experimental.hpp>
#include <agency/experimental.hpp>
template<class Executor, class View>
void saxpy(Executor& exec, size_t n, float a, View x, View y, View z)
{
using namespace agency;
bulk_invoke(par(n).on(exec), [=] __host__ __device__ (agency::parallel_agent& self)
{
int i = self.index();
z[i] = a * x[i] + y[i];
});
}
template<class Container, class Executor>
double time_saxpy()
{
using agency::experimental::all;
Executor exec;
size_t n = 32 << 20;
Container x(n, 1.f), y(n, 2.f), z(n);
float a = 13.;
saxpy(exec, n, a, all(x), all(y), all(z));
{
// put this in its own scope so that ref's storage gets deallocated
// before we measure performance
Container ref(n, a * 1.f + 2.f);
assert(ref == z);
}
// ensure everything migrates back to the GPU before timing
saxpy(exec, n, a, all(x), all(y), all(z));
// time a number of trials
size_t num_trials = 20;
auto start = std::chrono::high_resolution_clock::now();
for(size_t i = 0; i < num_trials; ++i)
{
saxpy(exec, n, a, all(x), all(y), all(z));
}
std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - start;
double seconds = elapsed.count() / num_trials;
double gigabytes = double(3 * n * sizeof(float)) / (1 << 30);
return gigabytes / seconds;
}
int main()
{
// the right choice of container can make a huge difference to performance
// a contiguous container such as std::vector is laid out in a single contiguous allocation
using contiguous_container = std::vector<float, agency::cuda::managed_allocator<float>>;
// a segmented container such as multidevice_array is pieced together from separate, contiguous allocations
using segmented_container = agency::cuda::experimental::multidevice_array<float>;
// let's first measure the performance of SAXPY on a single GPU using a contiguous container
using gpu_executor = agency::cuda::grid_executor;
std::cout << "Measuring the performance of gpu SAXPY using contiguous containers..." << std::endl;
auto singlegpu_bandwidth = time_saxpy<contiguous_container, gpu_executor>();
std::cout << "SAXPY Bandwidth: " << singlegpu_bandwidth << " GB/s" << std::endl;
std::cout << std::endl;
// now let's use multiple gpus at once to execute SAXPY and see how the choice of container impacts performance
using multigpu_executor = agency::cuda::multidevice_executor;
std::cout << "Measuring the performance of multigpu SAXPY using contiguous containers..." << std::endl;
auto multigpu_contiguous_bandwidth = time_saxpy<contiguous_container, multigpu_executor>();
std::cout << "SAXPY Bandwidth: " << multigpu_contiguous_bandwidth << " GB/s" << std::endl;
std::cout << std::endl;
std::cout << "Measuring the performance of multigpu SAXPY using segmented containers..." << std::endl;
auto multigpu_segmented_bandwidth = time_saxpy<segmented_container, multigpu_executor>();
std::cout << "SAXPY Bandwidth: " << multigpu_segmented_bandwidth << " GB/s" << std::endl;
std::cout << std::endl;
std::cout << "Multigpu speedup: " << multigpu_segmented_bandwidth / singlegpu_bandwidth << "x" << std::endl;
return 0;
}