diff --git a/include/boost/compute/distributed/copy.hpp b/include/boost/compute/distributed/copy.hpp index 6875de8c9..bd18226a0 100644 --- a/include/boost/compute/distributed/copy.hpp +++ b/include/boost/compute/distributed/copy.hpp @@ -27,12 +27,20 @@ #include #include -#include +#include namespace boost { namespace compute { namespace distributed { +// forward declaration for distributed::vector +template< + class T, + weight_func weight, + class Alloc +> +class vector; + namespace detail { template diff --git a/include/boost/compute/distributed/detail/weight_func.hpp b/include/boost/compute/distributed/detail/weight_func.hpp new file mode 100644 index 000000000..31355469e --- /dev/null +++ b/include/boost/compute/distributed/detail/weight_func.hpp @@ -0,0 +1,79 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2016 Jakub Szuppe +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_DETAIL_WEIGHT_FUNC_HPP +#define BOOST_COMPUTE_DETAIL_WEIGHT_FUNC_HPP + +#include + +#include +#include + +namespace boost { +namespace compute { +namespace distributed { + +typedef std::vector (*weight_func)(const command_queue&); + +namespace detail { + +/// \internal_ +/// Rounds up \p n to the nearest multiple of \p m. +/// Note: \p m must be a power of 2.
+size_t round_up(size_t n, size_t m) +{ + assert(m && ((m & (m -1)) == 0)); + return (n + m - 1) & ~(m - 1); +} + +/// \internal_ +/// Returns cumulative partition boundaries (starting at 0) that split \p size elements by the weights from \p weight_func; each boundary is rounded up to a multiple of \p align and capped at \p size. +std::vector partition(const command_queue& queue, + weight_func weight_func, + const size_t size, + const size_t align) +{ + std::vector weights = weight_func(queue); + std::vector partition; + partition.reserve(queue.size() + 1); + partition.push_back(0); + + if(queue.size() > 1) + { + double acc = 0; + for(size_t i = 0; i < queue.size(); i++) + { + acc += weights[i]; + partition.push_back( + std::min( + size, + round_up(size * acc, align) + ) + ); + } + return partition; + } + partition.push_back(size); + return partition; +} + +} // end distributed detail + +std::vector default_weight_func(const command_queue& queue) +{ + return std::vector(queue.size(), 1.0/queue.size()); +} + +} // end distributed namespace +} // end compute namespace +} // end boost namespace + + +#endif /* BOOST_COMPUTE_DETAIL_WEIGHT_FUNC_HPP */ diff --git a/include/boost/compute/distributed/vector.hpp b/include/boost/compute/distributed/vector.hpp index 30e48bbf5..de2e9c4fe 100644 --- a/include/boost/compute/distributed/vector.hpp +++ b/include/boost/compute/distributed/vector.hpp @@ -33,8 +33,6 @@ #include #include #include -#include // ? -#include // ? #include #include #include @@ -44,62 +42,13 @@ #include #include +#include +#include namespace boost { namespace compute { namespace distributed { -typedef std::vector (*weight_func)(const command_queue&); - -namespace detail { - -/// \internal_ -/// Rounds up \p n to the nearest multiple of \p m. -/// Note: \p m must be a multiple of 2.
-size_t round_up(size_t n, size_t m) -{ - assert(m && ((m & (m -1)) == 0)); - return (n + m - 1) & ~(m - 1); -} - -/// \internal_ -/// -std::vector partition(const command_queue& queue, - weight_func weight_func, - const size_t size, - const size_t align) -{ - std::vector weights = weight_func(queue); - std::vector partition; - partition.reserve(queue.size() + 1); - partition.push_back(0); - - if(queue.size() > 1) - { - double acc = 0; - for(size_t i = 0; i < queue.size(); i++) - { - acc += weights[i]; - partition.push_back( - std::min( - size, - round_up(size * acc, align) - ) - ); - } - return partition; - } - partition.push_back(size); - return partition; -} - -} // end distributed detail - -std::vector default_weight_func(const command_queue& queue) -{ - return std::vector(queue.size(), 1.0/queue.size()); -} - /// \class vector /// \brief A resizable array of values allocated across multiple devices. /// @@ -131,7 +80,6 @@ class vector : m_queue(context), m_size(0) { - // TODO lazy allocation? for(size_t i = 0; i < m_queue.size(); i++) { m_allocators.push_back(Alloc(context.get(i))); @@ -163,32 +111,34 @@ class vector /// \endcode vector(size_type count, const T &value, - command_queue &queue, - bool blocking = false) + command_queue &queue) : m_queue(queue), m_size(count) { allocate_memory(m_size); - wait_list events; + std::vector events; events.reserve(m_data.size()); for(size_t i = 0; i < m_data.size(); i++) { - events.safe_insert( + event e = ::boost::compute::fill_async( begin(i), end(i), value, queue.get(i) - ) - ); + ).get_event(); + if(e.get()) { + events.push_back(e); + } } - if(blocking) { - events.wait(); + for(size_t i = 0; i < events.size(); i++) { + events[i].wait(); } } /// Creates a vector with space for the values in the range [\p first, - /// \p last) and copies them into the vector with \p queue. + /// \p last) allocated on the host and copies them into the vector + /// with \p queue. 
/// /// For example: /// \code @@ -198,83 +148,116 @@ class vector /// // create a vector of size four and copy the values from data /// boost::compute::distributed::vector vec(data, data + 4, queue); /// \endcode - template - vector(InputIterator first, - InputIterator last, + template + vector(HostIterator first, + HostIterator last, command_queue &queue, - bool blocking = false) + typename boost::enable_if_c< + !is_device_iterator::value + >::type* = 0) : m_queue(queue), m_size(::boost::compute::detail::iterator_range_size(first, last)) { allocate_memory(m_size); - copy(first, last, m_queue, blocking); + ::boost::compute::distributed::copy(first, last, *this, m_queue); + } + + /// Creates a vector with space for the values in the range [\p first, + /// \p last) allocated on an OpenCL device and copies them into the vector + /// with \p queue. + template + vector(DeviceIterator first, + DeviceIterator last, + ::boost::compute::command_queue &device_queue, + command_queue &distributed_queue, + typename boost::enable_if_c< + is_device_iterator::value + >::type* = 0) + : m_queue(distributed_queue), + m_size(::boost::compute::detail::iterator_range_size(first, last)) + { + allocate_memory(m_size); + ::boost::compute::distributed::copy( + first, last, *this, device_queue, m_queue + ); } /// Creates a new vector and copies the values from \p other. - explicit vector(const vector &other, bool blocking = false) + explicit vector(const vector &other) : m_queue(other.m_queue), m_size(other.m_size) { allocate_memory(m_size); - copy(other, m_queue, blocking); + ::boost::compute::distributed::copy( + other, *this, m_queue + ); } /// Creates a new vector and copies the values from \p other /// with \p queue. 
- vector(const vector &other, command_queue &queue, bool blocking = false) + vector(const vector &other, command_queue &queue) : m_queue(queue), m_size(other.m_size) { allocate_memory(m_size); if(m_queue == other.m_queue) { - copy(other, m_queue, blocking); + ::boost::compute::distributed::copy( + other, *this, m_queue + ); } else { command_queue other_queue = other.get_queue(); - copy(other, other_queue, m_queue); + ::boost::compute::distributed::copy( + other, *this, other_queue, m_queue + ); } } /// Creates a new vector and copies the values from \p other /// with \p queue. template - vector(const vector &other, - bool blocking = false) + vector(const vector &other) : m_queue(other.m_queue), m_size(other.m_size) { allocate_memory(m_size); - copy(other, m_queue, blocking); + ::boost::compute::distributed::copy( + other, *this, m_queue + ); } /// Creates a new vector and copies the values from \p other. template vector(const vector &other, - command_queue &queue, - bool blocking = false) + command_queue &queue) : m_queue(queue), m_size(other.size()) { allocate_memory(m_size); if(m_queue == other.get_queue()) { - copy(other, m_queue, blocking); + ::boost::compute::distributed::copy( + other, *this, m_queue + ); } else { command_queue other_queue = other.get_queue(); - copy(other, other_queue, m_queue); + ::boost::compute::distributed::copy( + other, *this, other_queue, m_queue + ); } } /// Creates a new vector and copies the values from \p vector. template vector(const std::vector &vector, - command_queue &queue, - bool blocking = false) + command_queue &queue) : m_queue(queue), m_size(vector.size()) { allocate_memory(m_size); - copy(vector.begin(), vector.end(), m_queue, blocking); + ::boost::compute::distributed::copy( + vector.begin(), vector.end(), *this, m_queue + ); } /// Copy assignment. This operation is always non-blocking. 
@@ -284,7 +267,7 @@ class vector m_queue = other.m_queue; m_size = other.m_size; allocate_memory(m_size); - copy(other, m_queue, false); + ::boost::compute::distributed::copy(other, *this, m_queue); } return *this; } @@ -296,7 +279,7 @@ class vector m_queue = other.get_queue(); m_size = other.size(); allocate_memory(m_size); - copy(other, m_queue, false); + ::boost::compute::distributed::copy(other, *this, m_queue); return *this; } @@ -306,7 +289,9 @@ class vector { m_size = vector.size(); allocate_memory(m_size); - copy(vector.begin(), vector.end(), m_queue, false); + ::boost::compute::distributed::copy( + vector.begin(), vector.end(), *this, m_queue + ); return *this; } @@ -483,7 +468,6 @@ class vector /// Removes all elements from the vector. void clear() { - //TODO: ??? m_size = 0; } @@ -557,121 +541,6 @@ class vector } } - // host -> device - template - inline wait_list - copy_async(Iterator first, - Iterator last, - command_queue &queue, - typename boost::enable_if_c< - !is_device_iterator::value - >::type* = 0) - { - typedef typename Iterator::difference_type diff_type; - wait_list events; - events.reserve(m_data.size()); - - Iterator part_first = first; - Iterator part_end = first; - for(size_t i = 0; i < m_data.size(); i++) - { - part_end = (std::min)( - part_end + static_cast(m_data_sizes[i]), - last - ); - events.safe_insert( - ::boost::compute::copy_async( - part_first, - part_end, - begin(i), - queue.get(i) - ) - ); - part_first = part_end; - } - return events; - } - - // host -> device - template - inline void - copy(Iterator first, - Iterator last, - command_queue &queue, - bool blocking, - typename boost::enable_if_c< - !is_device_iterator::value - >::type* = 0) - { - if(blocking) { - copy_async(first, last, queue).wait(); - } else { - copy_async(first, last, queue); - } - } - - // device -> device (copying distributed vector) - // both vectors must have the same command_queue - template - inline wait_list - copy_async(const vector &other, 
command_queue &queue) - { - wait_list events; - events.reserve(m_data.size()); - for(size_t i = 0; i < m_data.size(); i++) - { - events.safe_insert( - ::boost::compute::copy_async( - other.begin(i), - other.end(i), - begin(i), - queue.get(i) - ) - ); - } - return events; - } - - // device -> device (copying distributed vector) - // both vectors must have the same command_queue - template - inline void - copy(const vector &other, command_queue &queue, bool blocking) - { - if(blocking) { - copy_async(other, queue).wait(); - } else { - copy_async(other, queue); - } - } - - // device -> device (copying distributed vector) - template - inline void - copy(const vector &other, - command_queue &other_queue, - command_queue &queue) - { - wait_list events; - events.reserve(m_data.size()); - std::vector host(other.size()); - typename std::vector::iterator host_iter = host.begin(); - for(size_t i = 0; i < other.parts(); i++) - { - events.safe_insert( - ::boost::compute::copy_async( - other.begin(i), - other.end(i), - host_iter, - other_queue.get(i) - ) - ); - host_iter += other.part_size(i); - } - events.wait(); - copy_async(host.begin(), host.end(), queue).wait(); - } - private: command_queue m_queue; size_type m_size; diff --git a/test/test_distributed_vector.cpp b/test/test_distributed_vector.cpp index 7f2e5f3ef..13ec21ae1 100644 --- a/test/test_distributed_vector.cpp +++ b/test/test_distributed_vector.cpp @@ -79,9 +79,6 @@ BOOST_AUTO_TEST_CASE(command_queue_ctor) bc::distributed::vector distributed_vector( size_t(35), value, distributed_queue ); - bc::distributed::vector distributed_vector_blocking( - size_t(35), value, distributed_queue, true - ); BOOST_CHECK(!distributed_vector.empty()); BOOST_CHECK(distributed_vector.size() == 35); @@ -110,14 +107,6 @@ BOOST_AUTO_TEST_CASE(command_queue_ctor) distributed_queue.get(i) ) ); - BOOST_CHECK( - bc::equal( - distributed_vector_blocking.begin(i), - distributed_vector_blocking.begin(i), - bc::make_constant_iterator(value), - 
distributed_queue.get(i) - ) - ); } } @@ -134,9 +123,6 @@ BOOST_AUTO_TEST_CASE(host_iterator_ctor) bc::distributed::vector distributed_vector( host_vector.begin(), host_vector.end(), distributed_queue ); - bc::distributed::vector distributed_vector_blocking( - host_vector.begin(), host_vector.end(), distributed_queue, true - ); BOOST_CHECK(!distributed_vector.empty()); BOOST_CHECK(distributed_vector.size() == host_vector.size()); @@ -159,14 +145,6 @@ BOOST_AUTO_TEST_CASE(host_iterator_ctor) distributed_queue.get(i) ) ); - BOOST_CHECK( - bc::equal( - distributed_vector_blocking.begin(i), - distributed_vector_blocking.begin(i), - bc::make_constant_iterator(value), - distributed_queue.get(i) - ) - ); } // need to finish since back() and front() @@ -192,19 +170,19 @@ BOOST_AUTO_TEST_CASE(copy_ctor) size_t size = 64; bc::distributed::vector distributed_vector( - size, value, distributed_queue1, true + size, value, distributed_queue1 ); bc::distributed::vector distributed_vector_copy1( - distributed_vector, true + distributed_vector ); bc::distributed::vector distributed_vector_copy2( - distributed_vector, distributed_queue2, true + distributed_vector, distributed_queue2 ); bc::distributed::vector< bc::int_, bc::distributed::default_weight_func, bc::pinned_allocator > distributed_vector_copy3( - distributed_vector, distributed_queue2, true + distributed_vector, distributed_queue2 ); for(size_t i = 0; i < distributed_vector.parts(); i++)