From 9a578b7ad4290fba0f1fad998ab26c64cedcfd93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Pays?= Date: Sat, 17 Nov 2018 16:17:35 +0100 Subject: [PATCH] OpenCL: replace thread_local with a resource pool. (#516) * opencl: replace thread_local with a resource pool. * Local variables clean-up. * clang-format with -style=google * Removing compiler warnings. * Local variables naming fix. * Removed a no longer used mutex. * Fix XgemmBatched/Xgemv for retrieving wavefront size. * Fixing OpenCLBuffers ctor (const ref). * Fixing OpenCLComputation ctor (const ref). --- meson.build | 1 + src/neural/opencl/OpenCL.cc | 458 ++-------------------------- src/neural/opencl/OpenCL.h | 65 +--- src/neural/opencl/OpenCLBuffers.cc | 407 ++++++++++++++++++++++++ src/neural/opencl/OpenCLBuffers.h | 86 ++++++ src/neural/opencl/network_opencl.cc | 17 +- 6 files changed, 544 insertions(+), 490 deletions(-) create mode 100644 src/neural/opencl/OpenCLBuffers.cc create mode 100644 src/neural/opencl/OpenCLBuffers.h diff --git a/meson.build b/meson.build index fe4819004f..c5a6a907c3 100644 --- a/meson.build +++ b/meson.build @@ -291,6 +291,7 @@ if get_option('build_backends') opencl_files = [ 'src/neural/opencl/OpenCL.cc', 'src/neural/opencl/OpenCLTuner.cc', + 'src/neural/opencl/OpenCLBuffers.cc', 'src/neural/opencl/network_opencl.cc', ] diff --git a/src/neural/opencl/OpenCL.cc b/src/neural/opencl/OpenCL.cc index 31aba93d1b..994db354ab 100644 --- a/src/neural/opencl/OpenCL.cc +++ b/src/neural/opencl/OpenCL.cc @@ -80,26 +80,6 @@ const std::string sourceCode_sgemv = #include "clblast_level3/xgemv.opencl" ; -thread_local ThreadData opencl_thread_data; - -void OpenCL::ensure_thread_initialized() { - if (!opencl_thread_data.m_is_initialized) { - // Make kernels - opencl_thread_data.m_convolve1_kernel = cl::Kernel(m_program, "convolve1"); - opencl_thread_data.m_merge_kernel = cl::Kernel(m_program, "merge_bn"); - opencl_thread_data.m_in_transform_kernel = - cl::Kernel(m_program, 
"in_transform"); - opencl_thread_data.m_sgemm_kernel = cl::Kernel(m_program, "XgemmBatched"); - opencl_thread_data.m_out_transform_bn_kernel = - cl::Kernel(m_program, "out_transform_fused_bn"); - opencl_thread_data.m_out_transform_bn_in_kernel = - cl::Kernel(m_program, "out_transform_fused_bn_in"); - opencl_thread_data.m_sgemv_kernel = cl::Kernel(m_program, "Xgemv"); - opencl_thread_data.m_commandqueue = cl::CommandQueue(m_context, m_device); - opencl_thread_data.m_is_initialized = true; - } -} - void OpenCL_Network::add_weights(size_t layer, size_t size, const float* weights) { if (layer >= m_layers.size()) { @@ -117,408 +97,6 @@ void OpenCL_Network::add_weights(size_t layer, size_t size, const_cast(converted_weights.data())); } -void OpenCL_Network::forward(const std::vector& input, - std::vector& output_pol, - std::vector& output_val, - const int batch_size) const { - constexpr auto tiles = WINOGRAD_P; - constexpr auto width = 8; - constexpr auto height = 8; - - auto finalSize_pol = - m_layers[m_layers.size() - 2].ip_out_size * sizeof(net_t); - auto finalSize_val = m_layers.back().ip_out_size * sizeof(net_t); - - if (m_layers.back().is_policy) { - std::swap(finalSize_pol, finalSize_val); - } - - m_opencl.ensure_thread_initialized(); - - if (!opencl_thread_data.m_buffers_allocated) { - auto max_channels = unsigned{0}; - for (const auto& layer : m_layers) { - max_channels = - std::max(max_channels, std::max(layer.channels, layer.outputs)); - } - - const auto mwg = m_opencl.m_sgemm_tuners.mwg; - const auto nwg = m_opencl.m_sgemm_tuners.nwg; - const auto vwm = m_opencl.m_sgemm_tuners.vwm; - const auto vwn = m_opencl.m_sgemm_tuners.vwn; - - const auto m_ceil = ceilMultiple(ceilMultiple(max_channels, mwg), vwm); - const auto n_ceil = ceilMultiple(ceilMultiple(tiles, nwg), vwn); - - const auto max_batch_size = getMaxMatchSize(); - const auto alloc_inSize = - max_batch_size * width * height * max_channels * sizeof(net_t); - const auto alloc_vm_size = - max_batch_size * 
WINOGRAD_TILE * m_ceil * n_ceil * sizeof(net_t); - - auto v_zeros = std::vector(alloc_vm_size); - - opencl_thread_data.m_inBuffer = - cl::Buffer(m_opencl.m_context, CL_MEM_READ_WRITE, alloc_inSize); - opencl_thread_data.m_inBuffer2 = - cl::Buffer(m_opencl.m_context, CL_MEM_READ_WRITE, alloc_inSize); - opencl_thread_data.m_VBuffer = cl::Buffer( - m_opencl.m_context, - CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, - alloc_vm_size, v_zeros.data(), nullptr); - opencl_thread_data.m_MBuffer = - cl::Buffer(m_opencl.m_context, - CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, alloc_vm_size); - - opencl_thread_data.m_pinnedOutBuffer_pol = cl::Buffer( - m_opencl.m_context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, - max_batch_size * finalSize_pol); - opencl_thread_data.m_pinnedOutBuffer_val = cl::Buffer( - m_opencl.m_context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, - max_batch_size * finalSize_val); - - opencl_thread_data.m_buffers_allocated = true; - } - - cl::Buffer& inBuffer = opencl_thread_data.m_inBuffer; - cl::Buffer& inBuffer2 = opencl_thread_data.m_inBuffer2; - cl::Buffer& VBuffer = opencl_thread_data.m_VBuffer; - cl::Buffer& MBuffer = opencl_thread_data.m_MBuffer; - cl::CommandQueue& queue = opencl_thread_data.m_commandqueue; - - const auto inSize = sizeof(net_t) * input.size(); - queue.enqueueWriteBuffer(inBuffer, CL_FALSE, 0, inSize, input.data()); - - auto skip_in_trans = false; - for (auto iter = cbegin(m_layers); iter != cend(m_layers); iter++) { - const auto& layer = *iter; - const auto niter = std::next(iter); - - if (layer.is_input_convolution) { - assert(niter != cend(m_layers)); - auto conv_weights = begin(layer.weights); - auto bn_weights = begin(layer.weights) + 1; - auto skip_next_in_trans = false; - if (niter->is_residual_block) { - skip_next_in_trans = true; - } - convolve3(layer.channels, layer.outputs, inBuffer, inBuffer, VBuffer, - MBuffer, conv_weights, nullptr, bn_weights, skip_in_trans, - skip_next_in_trans, true, batch_size); 
- skip_in_trans = skip_next_in_trans; - } else if (layer.is_residual_block) { - assert(layer.channels == layer.outputs); - assert(niter != cend(m_layers)); - auto conv1_weights = begin(layer.weights); - auto bn1_weights = begin(layer.weights) + 1; - auto conv2_weights = begin(layer.weights) + 3; - auto bn2_weights = begin(layer.weights) + 4; - convolve3(layer.channels, layer.outputs, inBuffer, inBuffer2, VBuffer, - MBuffer, conv1_weights, nullptr, bn1_weights, skip_in_trans, - true, false, batch_size); - - auto skip_next_in_trans = false; - if (niter->is_residual_block) { - skip_next_in_trans = true; - } - convolve3(layer.channels, layer.outputs, inBuffer2, inBuffer, VBuffer, - MBuffer, conv2_weights, &inBuffer, bn2_weights, true, - skip_next_in_trans, true, batch_size); - skip_in_trans = skip_next_in_trans; - } else { - assert(layer.is_value || layer.is_policy); - - cl::Buffer out_buffer; - if (layer.is_policy) { - out_buffer = opencl_thread_data.m_pinnedOutBuffer_pol; - } else { - out_buffer = opencl_thread_data.m_pinnedOutBuffer_val; - } - - auto ip_w = begin(layer.weights) + 3; - auto ip_b = begin(layer.weights) + 4; - - convolve1(layer.channels, layer.outputs, inBuffer, inBuffer2, VBuffer, - begin(layer.weights), batch_size); - - innerproduct(inBuffer2, ip_w, ip_b, out_buffer, layer.ip_in_size, - layer.ip_out_size, layer.is_value, batch_size); - } - } - - auto pinnedOutBufferHost_pol = - queue.enqueueMapBuffer(opencl_thread_data.m_pinnedOutBuffer_pol, CL_FALSE, - CL_MAP_READ, 0, batch_size * finalSize_pol); - auto pinnedOutBufferHost_val = - queue.enqueueMapBuffer(opencl_thread_data.m_pinnedOutBuffer_val, CL_FALSE, - CL_MAP_READ, 0, batch_size * finalSize_val); - - { - // Finish call is usually a busy wait. When using multiple threads, - // use the lock to avoid busy waiting with all threads. 
- std::lock_guard lock(m_queue_finish_mutex); - queue.finish(); - } - - std::memcpy(output_pol.data(), pinnedOutBufferHost_pol, - batch_size * finalSize_pol); - std::memcpy(output_val.data(), pinnedOutBufferHost_val, - batch_size * finalSize_val); - - queue.enqueueUnmapMemObject(opencl_thread_data.m_pinnedOutBuffer_pol, - pinnedOutBufferHost_pol); - queue.enqueueUnmapMemObject(opencl_thread_data.m_pinnedOutBuffer_val, - pinnedOutBufferHost_val); -} - -void OpenCL_Network::convolve3(int channels, int outputs, cl::Buffer& bufferIn, - cl::Buffer& bufferOut, cl::Buffer& bufferV, - cl::Buffer& bufferM, weight_slice_t weights, - cl::Buffer* bufferResidual, - weight_slice_t bn_weights, - bool skip_in_transform, bool fuse_in_transform, - bool store_inout, int batch_size) const { - cl::Kernel& in_transform_kernel = opencl_thread_data.m_in_transform_kernel; - cl::Kernel& sgemm_kernel = opencl_thread_data.m_sgemm_kernel; - cl::Kernel& out_transform_bn_kernel = - opencl_thread_data.m_out_transform_bn_kernel; - cl::Kernel& out_transform_bn_in_kernel = - opencl_thread_data.m_out_transform_bn_in_kernel; - - auto mwg = m_opencl.m_sgemm_tuners.mwg; - auto nwg = m_opencl.m_sgemm_tuners.nwg; - auto kwg = m_opencl.m_sgemm_tuners.kwg; - auto vwm = m_opencl.m_sgemm_tuners.vwm; - auto vwn = m_opencl.m_sgemm_tuners.vwn; - auto mdimc = m_opencl.m_sgemm_tuners.mdimc; - auto ndimc = m_opencl.m_sgemm_tuners.ndimc; - auto wavefront_size = m_opencl.m_wavefront_size; - - assert(mwg != 0); - assert(nwg != 0); - assert(kwg != 0); - assert(mdimc != 0); - assert(ndimc != 0); - assert(vwm != 0); - assert(vwn != 0); - assert(wavefront_size != 0); - - constexpr auto tiles = WINOGRAD_P; - constexpr auto width = 8; - constexpr auto height = 8; - - auto wgs = ceilMultiple(tiles, wavefront_size); - auto m_ceil = int(ceilMultiple(ceilMultiple(outputs, mwg), vwm)); - auto n_ceil = int(ceilMultiple(ceilMultiple(batch_size * tiles, nwg), vwn)); - auto k_ceil = int(ceilMultiple(ceilMultiple(channels, kwg), 
vwm)); - - cl::CommandQueue& queue = opencl_thread_data.m_commandqueue; - - if (!skip_in_transform) { - try { - in_transform_kernel.setArg(0, bufferIn); - in_transform_kernel.setArg(1, bufferV); - in_transform_kernel.setArg(2, channels); - in_transform_kernel.setArg(3, k_ceil); - in_transform_kernel.setArg(4, n_ceil); - - queue.enqueueNDRangeKernel(in_transform_kernel, cl::NullRange, - cl::NDRange(wgs, channels, batch_size)); - } catch (const cl::Error& e) { - CERR << "Error in convolve3: " << e.what() << ": " << e.err() - << std::endl; - throw; - } - } - - try { - sgemm_kernel.setArg(0, m_ceil); - sgemm_kernel.setArg(1, n_ceil); - sgemm_kernel.setArg(2, k_ceil); - sgemm_kernel.setArg(3, weights[0]); - sgemm_kernel.setArg(4, bufferV); - sgemm_kernel.setArg(5, bufferM); - - cl::NDRange local_sgemm = {mdimc, ndimc, 1}; - - cl::NDRange size_sgemm = {(m_ceil * mdimc) / mwg, (n_ceil * ndimc) / nwg, - (cl::size_type)WINOGRAD_TILE}; - - queue.enqueueNDRangeKernel(sgemm_kernel, cl::NullRange, size_sgemm, - local_sgemm); - } catch (const cl::Error& e) { - CERR << "Error in convolve3: " << e.what() << ": " << e.err() - << std::endl; - throw; - } - - try { - if (fuse_in_transform) { - // TODO : Eventually this might also be something tuneable? 
- constexpr auto dim_size = 2; - out_transform_bn_in_kernel.setArg(0, bufferM); - if (store_inout) { - out_transform_bn_in_kernel.setArg(1, bufferOut); - } else { - out_transform_bn_in_kernel.setArg(1, nullptr); - } - out_transform_bn_in_kernel.setArg(2, bufferV); - out_transform_bn_in_kernel.setArg(3, outputs); - out_transform_bn_in_kernel.setArg(4, m_ceil); - out_transform_bn_in_kernel.setArg(5, n_ceil); - // k_ceil of the next convolution - auto k_ceil2 = int(ceilMultiple(ceilMultiple(outputs, kwg), vwm)); - out_transform_bn_in_kernel.setArg(6, k_ceil2); - if (bufferResidual) { - out_transform_bn_in_kernel.setArg(7, *bufferResidual); - } else { - out_transform_bn_in_kernel.setArg(7, nullptr); - } - out_transform_bn_in_kernel.setArg(8, bn_weights[0]); - out_transform_bn_in_kernel.setArg(9, bn_weights[1]); - out_transform_bn_in_kernel.setArg( - 10, cl::Local(dim_size * width * height * sizeof(float))); - - queue.enqueueNDRangeKernel(out_transform_bn_in_kernel, cl::NullRange, - cl::NDRange(outputs, wgs, batch_size), - cl::NDRange(dim_size, wgs, 1)); - } else { - out_transform_bn_kernel.setArg(0, bufferM); - out_transform_bn_kernel.setArg(1, bufferOut); - out_transform_bn_kernel.setArg(2, outputs); - out_transform_bn_kernel.setArg(3, m_ceil); - out_transform_bn_kernel.setArg(4, n_ceil); - if (bufferResidual) { - out_transform_bn_kernel.setArg(5, *bufferResidual); - } else { - out_transform_bn_kernel.setArg(5, nullptr); - } - out_transform_bn_kernel.setArg(6, bn_weights[0]); - out_transform_bn_kernel.setArg(7, bn_weights[1]); - - queue.enqueueNDRangeKernel(out_transform_bn_kernel, cl::NullRange, - cl::NDRange(outputs, wgs, batch_size)); - } - } catch (const cl::Error& e) { - CERR << "Error in convolve3: " << e.what() << ": " << e.err() - << std::endl; - throw; - } -} - -void OpenCL_Network::convolve1(int channels, int outputs, - cl::Buffer& bufferInput, - cl::Buffer& bufferOutput, - cl::Buffer& bufferMerge, weight_slice_t weights, - int batch_size) const { - // fixed 
for 8x8. - constexpr int width = 8; - constexpr int height = 8; - constexpr int boardsize = width * height; - constexpr int rowTiles = 8; - - // Input channel grouping in multiples of 8. - constexpr int channelGroup = 8; - constexpr int channelShift = 3; - constexpr int rowGroup = 1; - // Assumes that if outputs > 16, then outputs is divisible by 16. - size_t outputGroup = std::min(outputs, 16); - - auto m_convolve_kernel = &opencl_thread_data.m_convolve1_kernel; - -#ifndef NDEBUG - // Total output size after reducing. - size_t outSize = width * height * outputs * sizeof(net_t); - - // Produce channel * output planes and merge them at the end. - size_t mergeSize = (channels >> channelShift) * outSize; - assert(mergeSize <= bufferMerge.getInfo()); -#endif - - // Copy the rows locally. - size_t stripSize = width * sizeof(float); - - int rowBuffer = std::min(channelGroup, 7); - size_t rowSize = channelGroup * outputGroup * rowBuffer * sizeof(float); - - cl::CommandQueue& queue = opencl_thread_data.m_commandqueue; - - try { - m_convolve_kernel->setArg(0, bufferInput); - m_convolve_kernel->setArg(1, bufferMerge); - m_convolve_kernel->setArg(2, weights[0]); - m_convolve_kernel->setArg(3, - cl::Local(stripSize * channelGroup * rowGroup)); - m_convolve_kernel->setArg(4, cl::Local(rowSize)); - - queue.enqueueNDRangeKernel( - *m_convolve_kernel, cl::NullRange, - cl::NDRange(channels, outputs, batch_size * rowTiles), - cl::NDRange(channelGroup, outputGroup, rowGroup)); - } catch (const cl::Error& e) { - CERR << "Error in convolve1: " << e.what() << ": " << e.err() - << std::endl; - throw; - } - - cl::Kernel& merge_kernel = opencl_thread_data.m_merge_kernel; - assert(channels % (1 << channelShift) == 0); - - try { - merge_kernel.setArg(0, bufferMerge); - merge_kernel.setArg(1, bufferOutput); - merge_kernel.setArg(2, channels >> channelShift); - merge_kernel.setArg(3, weights[1]); - merge_kernel.setArg(4, weights[2]); - - queue.enqueueNDRangeKernel(merge_kernel, cl::NullRange, 
- cl::NDRange(outputs, boardsize, batch_size), - cl::NDRange(std::min(8, outputs), 8, 1)); - } catch (const cl::Error& e) { - CERR << "Error in merge: " << e.what() << ": " << e.err() << std::endl; - throw; - } -} - -void OpenCL_Network::innerproduct(cl::Buffer& input, weight_slice_t weights, - weight_slice_t biases, cl::Buffer& output, - const int inputs, const int outputs, - const int relu, int batch_size) const { - auto sgemv_kernel = opencl_thread_data.m_sgemv_kernel; - cl::CommandQueue& queue = opencl_thread_data.m_commandqueue; - - // TODO: Tune these. - size_t wgs1 = 64; - size_t wpt1 = 1; - - auto m_ceil = int(ceilMultiple(outputs, wgs1 * wpt1)); - auto global_size = m_ceil / wpt1; - auto local_size = wgs1; - - try { - // Sets the kernel arguments. - sgemv_kernel.setArg(0, static_cast(outputs)); - sgemv_kernel.setArg(1, static_cast(inputs)); - sgemv_kernel.setArg(2, weights[0]); - sgemv_kernel.setArg(3, static_cast(0)); - sgemv_kernel.setArg(4, static_cast(inputs)); - sgemv_kernel.setArg(5, input); - sgemv_kernel.setArg(6, static_cast(0)); - sgemv_kernel.setArg(7, output); - sgemv_kernel.setArg(8, static_cast(0)); - sgemv_kernel.setArg(9, biases[0]); - sgemv_kernel.setArg(10, static_cast(relu)); - - queue.enqueueNDRangeKernel(sgemv_kernel, cl::NullRange, - cl::NDRange(global_size, batch_size), - cl::NDRange(local_size, 1)); - } catch (const cl::Error& e) { - CERR << "Error in innerproduct: " << e.what() << ": " << e.err() - << std::endl; - throw; - } -} - template static std::string opencl_dev_type_to_string(T type) { if (type == CL_DEVICE_TYPE_CPU) { @@ -668,7 +246,7 @@ void OpenCL::initialize(const int channels, const OpenCLParams& params) { p.getDevices(CL_DEVICE_TYPE_ALL, &devices); } catch (const cl::Error& e) { CERR << "Error getting device(s): " << e.what() << ": " << e.err() - << std::endl; + << std::endl; devices.clear(); } for (auto& d : devices) { @@ -679,10 +257,10 @@ void OpenCL::initialize(const int channels, const OpenCLParams& params) { << 
opencl_dev_type_to_string(d.getInfo()); CERR << "Device vendor: " << d.getInfo(); CERR << "Device driver: " << d.getInfo(); - CERR << "Device speed: " - << d.getInfo() << " MHZ"; - CERR << "Device cores: " - << d.getInfo() << " CU"; + CERR << "Device speed: " << d.getInfo() + << " MHZ"; + CERR << "Device cores: " << d.getInfo() + << " CU"; // assign score, try to find best device int this_score = 0; @@ -722,9 +300,9 @@ void OpenCL::initialize(const int channels, const OpenCLParams& params) { CERR << "Selected platform: " << best_platform.getInfo(); CERR << "Selected device: " - << trim_left(best_device.getInfo().c_str()); - CERR << "with OpenCL " << std::fixed << std::setprecision(1) - << best_version << " capability."; + << trim_left(best_device.getInfo().c_str()); + CERR << "with OpenCL " << std::fixed << std::setprecision(1) << best_version + << " capability."; cl::Context context; try { context = cl::Context(best_device); @@ -759,15 +337,16 @@ void OpenCL::initialize(const int channels, const OpenCLParams& params) { m_program.build(args.c_str()); } catch (const cl::Error&) { CERR << "Error building kernels: " - << m_program.getBuildInfo(m_device) << "."; + << m_program.getBuildInfo(m_device) << "."; throw std::runtime_error("Error building OpenCL kernels."); } - ensure_thread_initialized(); process_tuners(sgemm_tuners); + auto sgemm_kernel = cl::Kernel(m_program, "XgemmBatched"); + m_wavefront_size = - opencl_thread_data.m_sgemm_kernel + sgemm_kernel .getWorkGroupInfo( best_device); CERR << "Wavefront/Warp size: " << m_wavefront_size << std::endl; @@ -782,6 +361,19 @@ void OpenCL::initialize(const int channels, const OpenCLParams& params) { m_init_ok = true; } +std::unique_ptr OpenCL_Network::acquire_buffers() const { + std::lock_guard lock(m_pool_mutex); + if (m_buffers_pool.empty()) return std::make_unique(*this); + auto result = std::move(m_buffers_pool.back()); + m_buffers_pool.pop_back(); + return result; +} + +void 
OpenCL_Network::release_buffers(std::unique_ptr buffers) const { + std::lock_guard lock(m_pool_mutex); + m_buffers_pool.push_back(std::move(buffers)); +} + std::string OpenCL::get_device_name() { std::stringstream ss; diff --git a/src/neural/opencl/OpenCL.h b/src/neural/opencl/OpenCL.h index 79191488f5..7fe330cbd5 100644 --- a/src/neural/opencl/OpenCL.h +++ b/src/neural/opencl/OpenCL.h @@ -34,7 +34,8 @@ using net_t = float; #include #include "cl2.hpp" -#include "OpenCLParams.h" +#include "neural/opencl/OpenCLBuffers.h" +#include "neural/opencl/OpenCLParams.h" inline size_t ceilMultiple(size_t a, size_t b) { if (a % b == 0) return a; @@ -45,9 +46,11 @@ static constexpr auto WINOGRAD_P = 8 * 8 / 4; static constexpr auto WINOGRAD_TILE = 4 * 4; class OpenCL; +class OpenCLBuffers; class Layer { friend class OpenCL_Network; + friend class OpenCLBuffers; private: unsigned int channels{0}; @@ -62,34 +65,16 @@ class Layer { std::vector weights; }; -class ThreadData { - friend class OpenCL; - friend class OpenCL_Network; - - private: - bool m_is_initialized{false}; - cl::CommandQueue m_commandqueue; - cl::Kernel m_convolve1_kernel; - cl::Kernel m_merge_kernel; - cl::Kernel m_in_transform_kernel; - cl::Kernel m_sgemm_kernel; - cl::Kernel m_sgemv_kernel; - cl::Kernel m_out_transform_bn_kernel; - cl::Kernel m_out_transform_bn_in_kernel; - cl::Buffer m_inBuffer; - cl::Buffer m_inBuffer2; - cl::Buffer m_VBuffer; - cl::Buffer m_MBuffer; - cl::Buffer m_pinnedOutBuffer_pol; - cl::Buffer m_pinnedOutBuffer_val; - bool m_buffers_allocated{false}; -}; - class OpenCL_Network { + friend class OpenCLBuffers; + public: OpenCL_Network(OpenCL& opencl) : m_opencl(opencl), m_max_batch_size(1) {} - OpenCL& getOpenCL() { return m_opencl; } + std::unique_ptr acquire_buffers() const; + void release_buffers(std::unique_ptr) const; + + OpenCL& getOpenCL() const { return m_opencl; } size_t getMaxMatchSize() const { return m_max_batch_size; } @@ -172,50 +157,28 @@ class OpenCL_Network { size_t 
get_layer_count() const { return m_layers.size(); } - void forward(const std::vector& input, std::vector& output_pol, - std::vector& output_val, const int batch_size) const; - private: - using weight_slice_t = std::vector::const_iterator; - void push_weights(size_t layer, const std::vector& weights) { add_weights(layer, weights.size(), weights.data()); } void add_weights(size_t layer, size_t size, const float* weights); - void convolve3(int channels, int outputs, cl::Buffer& bufferIn, - cl::Buffer& bufferOut, cl::Buffer& bufferV, - cl::Buffer& bufferM, weight_slice_t weights, - cl::Buffer* bufferResidual, weight_slice_t bn_weights, - bool skip_in_transform, bool fuse_in_transform, - bool store_inout, int batch_size) const; - - void convolve1(int channels, int outputs, cl::Buffer& bufferInput, - cl::Buffer& bufferOutput, cl::Buffer& bufferMerge, - weight_slice_t weights, int batch_size) const; - - void innerproduct(cl::Buffer& input, weight_slice_t weights, - weight_slice_t biases, cl::Buffer& output, const int inputs, - const int outputs, const int relu, int batch_size) const; - OpenCL& m_opencl; size_t m_max_batch_size; - // this mutex is not required for correctness, but this exists simply - // because queue.finish() is a busy wait and having a lot of threads - // waiting here is counterproductive CPU-wise. At least std::mutex - // isn't busy wait so it should be better. 
- mutable std::mutex m_queue_finish_mutex; std::vector m_layers; + + mutable std::mutex m_pool_mutex; + mutable std::vector> m_buffers_pool; }; class OpenCL { friend class OpenCL_Network; + friend class OpenCLBuffers; friend class Tuner; public: void initialize(const int channels, const OpenCLParams& params); - void ensure_thread_initialized(void); std::string get_device_name(); std::vector get_sgemm_tuners(void); diff --git a/src/neural/opencl/OpenCLBuffers.cc b/src/neural/opencl/OpenCLBuffers.cc new file mode 100644 index 0000000000..43081e944c --- /dev/null +++ b/src/neural/opencl/OpenCLBuffers.cc @@ -0,0 +1,407 @@ +/* + This file is part of Leela Zero. + Copyright (C) 2017 Gian-Carlo Pascutto + + Leela Zero is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Zero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Zero. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#include "neural/opencl/OpenCLBuffers.h" + +OpenCLBuffers::OpenCLBuffers(const OpenCL_Network& opencl_net) + : m_opencl_net(opencl_net), m_opencl(opencl_net.getOpenCL()) { + auto& program = m_opencl.m_program; + auto& context = m_opencl.m_context; + auto& device = m_opencl.m_device; + + m_convolve1_kernel = cl::Kernel(program, "convolve1"); + m_merge_kernel = cl::Kernel(program, "merge_bn"); + m_in_transform_kernel = cl::Kernel(program, "in_transform"); + m_sgemm_kernel = cl::Kernel(program, "XgemmBatched"); + m_out_transform_bn_kernel = cl::Kernel(program, "out_transform_fused_bn"); + m_out_transform_bn_in_kernel = + cl::Kernel(program, "out_transform_fused_bn_in"); + m_sgemv_kernel = cl::Kernel(program, "Xgemv"); + m_commandqueue = cl::CommandQueue(context, device); + + auto& layers = m_opencl_net.m_layers; + + constexpr auto tiles = WINOGRAD_P; + constexpr auto width = 8; + constexpr auto height = 8; + + auto finalSize_pol = + layers[layers.size() - 2].ip_out_size * sizeof(net_t); + auto finalSize_val = layers.back().ip_out_size * sizeof(net_t); + + auto max_channels = unsigned{0}; + for (const auto& layer : layers) { + max_channels = + std::max(max_channels, std::max(layer.channels, layer.outputs)); + } + + const auto mwg = m_opencl.m_sgemm_tuners.mwg; + const auto nwg = m_opencl.m_sgemm_tuners.nwg; + const auto vwm = m_opencl.m_sgemm_tuners.vwm; + const auto vwn = m_opencl.m_sgemm_tuners.vwn; + + const auto m_ceil = ceilMultiple(ceilMultiple(max_channels, mwg), vwm); + const auto n_ceil = ceilMultiple(ceilMultiple(tiles, nwg), vwn); + + const auto max_batch_size = m_opencl_net.getMaxMatchSize(); + const auto alloc_inSize = + max_batch_size * width * height * max_channels * sizeof(net_t); + const auto alloc_vm_size = + max_batch_size * WINOGRAD_TILE * m_ceil * n_ceil * sizeof(net_t); + + auto v_zeros = std::vector(alloc_vm_size); + + m_inBuffer = cl::Buffer(m_opencl.m_context, CL_MEM_READ_WRITE, alloc_inSize); + m_inBuffer2 = 
cl::Buffer(m_opencl.m_context, CL_MEM_READ_WRITE, alloc_inSize); + m_VBuffer = cl::Buffer( + m_opencl.m_context, + CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, + alloc_vm_size, v_zeros.data(), nullptr); + m_MBuffer = + cl::Buffer(m_opencl.m_context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, + alloc_vm_size); + + m_pinnedOutBuffer_pol = + cl::Buffer(m_opencl.m_context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, + max_batch_size * finalSize_pol); + m_pinnedOutBuffer_val = + cl::Buffer(m_opencl.m_context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, + max_batch_size * finalSize_val); +} + +void OpenCLBuffers::forward(const std::vector& input, + std::vector& output_pol, + std::vector& output_val, + const int batch_size) { + + auto& layers = m_opencl_net.m_layers; + + auto finalSize_pol = + layers[layers.size() - 2].ip_out_size * sizeof(net_t); + auto finalSize_val = layers.back().ip_out_size * sizeof(net_t); + + const auto inSize = sizeof(net_t) * input.size(); + m_commandqueue.enqueueWriteBuffer(m_inBuffer, CL_FALSE, 0, inSize, + input.data()); + + auto skip_in_trans = false; + for (auto iter = cbegin(layers); iter != cend(layers); iter++) { + const auto& layer = *iter; + const auto niter = std::next(iter); + + if (layer.is_input_convolution) { + assert(niter != cend(layers)); + auto conv_weights = begin(layer.weights); + auto bn_weights = begin(layer.weights) + 1; + auto skip_next_in_trans = false; + if (niter->is_residual_block) { + skip_next_in_trans = true; + } + convolve3(layer.channels, layer.outputs, m_inBuffer, m_inBuffer, + m_VBuffer, m_MBuffer, conv_weights, nullptr, bn_weights, + skip_in_trans, skip_next_in_trans, true, batch_size); + skip_in_trans = skip_next_in_trans; + } else if (layer.is_residual_block) { + assert(layer.channels == layer.outputs); + assert(niter != cend(layers)); + auto conv1_weights = begin(layer.weights); + auto bn1_weights = begin(layer.weights) + 1; + auto conv2_weights = begin(layer.weights) + 3; + auto 
bn2_weights = begin(layer.weights) + 4; + convolve3(layer.channels, layer.outputs, m_inBuffer, m_inBuffer2, + m_VBuffer, m_MBuffer, conv1_weights, nullptr, bn1_weights, + skip_in_trans, true, false, batch_size); + + auto skip_next_in_trans = false; + if (niter->is_residual_block) { + skip_next_in_trans = true; + } + convolve3(layer.channels, layer.outputs, m_inBuffer2, m_inBuffer, + m_VBuffer, m_MBuffer, conv2_weights, &m_inBuffer, bn2_weights, + true, skip_next_in_trans, true, batch_size); + skip_in_trans = skip_next_in_trans; + } else { + assert(layer.is_value || layer.is_policy); + + cl::Buffer out_buffer; + if (layer.is_policy) { + out_buffer = m_pinnedOutBuffer_pol; + } else { + out_buffer = m_pinnedOutBuffer_val; + } + + auto ip_w = begin(layer.weights) + 3; + auto ip_b = begin(layer.weights) + 4; + + convolve1(layer.channels, layer.outputs, m_inBuffer, m_inBuffer2, + m_VBuffer, begin(layer.weights), batch_size); + + innerproduct(m_inBuffer2, ip_w, ip_b, out_buffer, layer.ip_in_size, + layer.ip_out_size, layer.is_value, batch_size); + } + } + + auto pinnedOutBufferHost_pol = m_commandqueue.enqueueMapBuffer( + m_pinnedOutBuffer_pol, CL_FALSE, CL_MAP_READ, 0, + batch_size * finalSize_pol); + auto pinnedOutBufferHost_val = m_commandqueue.enqueueMapBuffer( + m_pinnedOutBuffer_val, CL_FALSE, CL_MAP_READ, 0, + batch_size * finalSize_val); + + m_commandqueue.finish(); + + std::memcpy(output_pol.data(), pinnedOutBufferHost_pol, + batch_size * finalSize_pol); + std::memcpy(output_val.data(), pinnedOutBufferHost_val, + batch_size * finalSize_val); + + m_commandqueue.enqueueUnmapMemObject(m_pinnedOutBuffer_pol, + pinnedOutBufferHost_pol); + m_commandqueue.enqueueUnmapMemObject(m_pinnedOutBuffer_val, + pinnedOutBufferHost_val); +} + +void OpenCLBuffers::convolve3(int channels, int outputs, cl::Buffer& bufferIn, + cl::Buffer& bufferOut, cl::Buffer& bufferV, + cl::Buffer& bufferM, weight_slice_t weights, + cl::Buffer* bufferResidual, + weight_slice_t bn_weights, bool 
skip_in_transform, + bool fuse_in_transform, bool store_inout, + int batch_size) { + auto mwg = m_opencl.m_sgemm_tuners.mwg; + auto nwg = m_opencl.m_sgemm_tuners.nwg; + auto kwg = m_opencl.m_sgemm_tuners.kwg; + auto vwm = m_opencl.m_sgemm_tuners.vwm; + auto vwn = m_opencl.m_sgemm_tuners.vwn; + auto mdimc = m_opencl.m_sgemm_tuners.mdimc; + auto ndimc = m_opencl.m_sgemm_tuners.ndimc; + auto wavefront_size = m_opencl.m_wavefront_size; + + assert(mwg != 0); + assert(nwg != 0); + assert(kwg != 0); + assert(mdimc != 0); + assert(ndimc != 0); + assert(vwm != 0); + assert(vwn != 0); + assert(wavefront_size != 0); + + constexpr auto tiles = WINOGRAD_P; + constexpr auto width = 8; + constexpr auto height = 8; + + auto wgs = ceilMultiple(tiles, wavefront_size); + auto m_ceil = int(ceilMultiple(ceilMultiple(outputs, mwg), vwm)); + auto n_ceil = int(ceilMultiple(ceilMultiple(batch_size * tiles, nwg), vwn)); + auto k_ceil = int(ceilMultiple(ceilMultiple(channels, kwg), vwm)); + + if (!skip_in_transform) { + try { + m_in_transform_kernel.setArg(0, bufferIn); + m_in_transform_kernel.setArg(1, bufferV); + m_in_transform_kernel.setArg(2, channels); + m_in_transform_kernel.setArg(3, k_ceil); + m_in_transform_kernel.setArg(4, n_ceil); + + m_commandqueue.enqueueNDRangeKernel( + m_in_transform_kernel, cl::NullRange, + cl::NDRange(wgs, channels, batch_size)); + } catch (const cl::Error& e) { + CERR << "Error in convolve3: " << e.what() << ": " << e.err() + << std::endl; + throw; + } + } + + try { + m_sgemm_kernel.setArg(0, m_ceil); + m_sgemm_kernel.setArg(1, n_ceil); + m_sgemm_kernel.setArg(2, k_ceil); + m_sgemm_kernel.setArg(3, weights[0]); + m_sgemm_kernel.setArg(4, bufferV); + m_sgemm_kernel.setArg(5, bufferM); + + cl::NDRange local_sgemm = {mdimc, ndimc, 1}; + + cl::NDRange size_sgemm = {(m_ceil * mdimc) / mwg, (n_ceil * ndimc) / nwg, + (cl::size_type)WINOGRAD_TILE}; + + m_commandqueue.enqueueNDRangeKernel(m_sgemm_kernel, cl::NullRange, + size_sgemm, local_sgemm); + } catch (const 
cl::Error& e) { + CERR << "Error in convolve3: " << e.what() << ": " << e.err() << std::endl; + throw; + } + + try { + if (fuse_in_transform) { + // TODO : Eventually this might also be something tuneable? + constexpr auto dim_size = 2; + m_out_transform_bn_in_kernel.setArg(0, bufferM); + if (store_inout) { + m_out_transform_bn_in_kernel.setArg(1, bufferOut); + } else { + m_out_transform_bn_in_kernel.setArg(1, nullptr); + } + m_out_transform_bn_in_kernel.setArg(2, bufferV); + m_out_transform_bn_in_kernel.setArg(3, outputs); + m_out_transform_bn_in_kernel.setArg(4, m_ceil); + m_out_transform_bn_in_kernel.setArg(5, n_ceil); + // k_ceil of the next convolution + auto k_ceil2 = int(ceilMultiple(ceilMultiple(outputs, kwg), vwm)); + m_out_transform_bn_in_kernel.setArg(6, k_ceil2); + if (bufferResidual) { + m_out_transform_bn_in_kernel.setArg(7, *bufferResidual); + } else { + m_out_transform_bn_in_kernel.setArg(7, nullptr); + } + m_out_transform_bn_in_kernel.setArg(8, bn_weights[0]); + m_out_transform_bn_in_kernel.setArg(9, bn_weights[1]); + m_out_transform_bn_in_kernel.setArg( + 10, cl::Local(dim_size * width * height * sizeof(float))); + + m_commandqueue.enqueueNDRangeKernel( + m_out_transform_bn_in_kernel, cl::NullRange, + cl::NDRange(outputs, wgs, batch_size), cl::NDRange(dim_size, wgs, 1)); + } else { + m_out_transform_bn_kernel.setArg(0, bufferM); + m_out_transform_bn_kernel.setArg(1, bufferOut); + m_out_transform_bn_kernel.setArg(2, outputs); + m_out_transform_bn_kernel.setArg(3, m_ceil); + m_out_transform_bn_kernel.setArg(4, n_ceil); + if (bufferResidual) { + m_out_transform_bn_kernel.setArg(5, *bufferResidual); + } else { + m_out_transform_bn_kernel.setArg(5, nullptr); + } + m_out_transform_bn_kernel.setArg(6, bn_weights[0]); + m_out_transform_bn_kernel.setArg(7, bn_weights[1]); + + m_commandqueue.enqueueNDRangeKernel( + m_out_transform_bn_kernel, cl::NullRange, + cl::NDRange(outputs, wgs, batch_size)); + } + } catch (const cl::Error& e) { + CERR << "Error in 
convolve3: " << e.what() << ": " << e.err() << std::endl; + throw; + } +} + +void OpenCLBuffers::convolve1(int channels, int outputs, + cl::Buffer& bufferInput, cl::Buffer& bufferOutput, + cl::Buffer& bufferMerge, weight_slice_t weights, + int batch_size) { + // fixed for 8x8. + constexpr int width = 8; + constexpr int height = 8; + constexpr int boardsize = width * height; + constexpr int rowTiles = 8; + + // Input channel grouping in multiples of 8. + constexpr int channelGroup = 8; + constexpr int channelShift = 3; + constexpr int rowGroup = 1; + // Assumes that if outputs > 16, then outputs is divisible by 16. + size_t outputGroup = std::min(outputs, 16); + +#ifndef NDEBUG + // Total output size after reducing. + size_t outSize = width * height * outputs * sizeof(net_t); + + // Produce channel * output planes and merge them at the end. + size_t mergeSize = (channels >> channelShift) * outSize; + assert(mergeSize <= bufferMerge.getInfo()); +#endif + + // Copy the rows locally. + size_t stripSize = width * sizeof(float); + + int rowBuffer = std::min(channelGroup, 7); + size_t rowSize = channelGroup * outputGroup * rowBuffer * sizeof(float); + + try { + m_convolve1_kernel.setArg(0, bufferInput); + m_convolve1_kernel.setArg(1, bufferMerge); + m_convolve1_kernel.setArg(2, weights[0]); + m_convolve1_kernel.setArg(3, + cl::Local(stripSize * channelGroup * rowGroup)); + m_convolve1_kernel.setArg(4, cl::Local(rowSize)); + + m_commandqueue.enqueueNDRangeKernel( + m_convolve1_kernel, cl::NullRange, + cl::NDRange(channels, outputs, batch_size * rowTiles), + cl::NDRange(channelGroup, outputGroup, rowGroup)); + } catch (const cl::Error& e) { + CERR << "Error in convolve1: " << e.what() << ": " << e.err() << std::endl; + throw; + } + + assert(channels % (1 << channelShift) == 0); + + try { + m_merge_kernel.setArg(0, bufferMerge); + m_merge_kernel.setArg(1, bufferOutput); + m_merge_kernel.setArg(2, channels >> channelShift); + m_merge_kernel.setArg(3, weights[1]); + 
m_merge_kernel.setArg(4, weights[2]); + + m_commandqueue.enqueueNDRangeKernel( + m_merge_kernel, cl::NullRange, + cl::NDRange(outputs, boardsize, batch_size), + cl::NDRange(std::min(8, outputs), 8, 1)); + } catch (const cl::Error& e) { + CERR << "Error in merge: " << e.what() << ": " << e.err() << std::endl; + throw; + } +} + +void OpenCLBuffers::innerproduct(cl::Buffer& input, weight_slice_t weights, + weight_slice_t biases, cl::Buffer& output, + const int inputs, const int outputs, + const int relu, int batch_size) { + // TODO: Tune these. + size_t wgs1 = 64; + size_t wpt1 = 1; + + auto m_ceil = int(ceilMultiple(outputs, wgs1 * wpt1)); + auto global_size = m_ceil / wpt1; + auto local_size = wgs1; + + try { + // Sets the kernel arguments. + m_sgemv_kernel.setArg(0, static_cast(outputs)); + m_sgemv_kernel.setArg(1, static_cast(inputs)); + m_sgemv_kernel.setArg(2, weights[0]); + m_sgemv_kernel.setArg(3, static_cast(0)); + m_sgemv_kernel.setArg(4, static_cast(inputs)); + m_sgemv_kernel.setArg(5, input); + m_sgemv_kernel.setArg(6, static_cast(0)); + m_sgemv_kernel.setArg(7, output); + m_sgemv_kernel.setArg(8, static_cast(0)); + m_sgemv_kernel.setArg(9, biases[0]); + m_sgemv_kernel.setArg(10, static_cast(relu)); + + m_commandqueue.enqueueNDRangeKernel(m_sgemv_kernel, cl::NullRange, + cl::NDRange(global_size, batch_size), + cl::NDRange(local_size, 1)); + } catch (const cl::Error& e) { + CERR << "Error in innerproduct: " << e.what() << ": " << e.err() + << std::endl; + throw; + } +} diff --git a/src/neural/opencl/OpenCLBuffers.h b/src/neural/opencl/OpenCLBuffers.h new file mode 100644 index 0000000000..1f6d537f5b --- /dev/null +++ b/src/neural/opencl/OpenCLBuffers.h @@ -0,0 +1,86 @@ +/* + This file is part of Leela Zero. 
+ Copyright (C) 2017 Gian-Carlo Pascutto + + Leela Zero is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Zero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Zero. If not, see . + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "neural/opencl/OpenCL.h" +#include "neural/opencl/OpenCLParams.h" +#include "neural/opencl/OpenCLTuner.h" +#include "utils/logging.h" + +class OpenCL_Network; + +class OpenCLBuffers { + friend class OpenCL; + friend class OpenCL_Network; + + public: + OpenCLBuffers(const OpenCL_Network& opencl_net); + + void forward(const std::vector& input, std::vector& output_pol, + std::vector& output_val, const int batch_size); + + private: + using weight_slice_t = std::vector::const_iterator; + + void convolve3(int channels, int outputs, cl::Buffer& bufferIn, + cl::Buffer& bufferOut, cl::Buffer& bufferV, + cl::Buffer& bufferM, weight_slice_t weights, + cl::Buffer* bufferResidual, weight_slice_t bn_weights, + bool skip_in_transform, bool fuse_in_transform, + bool store_inout, int batch_size); + + void convolve1(int channels, int outputs, cl::Buffer& bufferInput, + cl::Buffer& bufferOutput, cl::Buffer& bufferMerge, + weight_slice_t weights, int batch_size); + + void innerproduct(cl::Buffer& input, weight_slice_t weights, + weight_slice_t biases, cl::Buffer& output, const int inputs, + const int outputs, const int relu, int batch_size); + + const OpenCL_Network& m_opencl_net; + const OpenCL& m_opencl; 
+ + cl::CommandQueue m_commandqueue; + cl::Kernel m_convolve1_kernel; + cl::Kernel m_merge_kernel; + cl::Kernel m_in_transform_kernel; + cl::Kernel m_sgemm_kernel; + cl::Kernel m_sgemv_kernel; + cl::Kernel m_out_transform_bn_kernel; + cl::Kernel m_out_transform_bn_in_kernel; + cl::Buffer m_inBuffer; + cl::Buffer m_inBuffer2; + cl::Buffer m_VBuffer; + cl::Buffer m_MBuffer; + cl::Buffer m_pinnedOutBuffer_pol; + cl::Buffer m_pinnedOutBuffer_val; +}; diff --git a/src/neural/opencl/network_opencl.cc b/src/neural/opencl/network_opencl.cc index 2b5a547491..175495b22b 100644 --- a/src/neural/opencl/network_opencl.cc +++ b/src/neural/opencl/network_opencl.cc @@ -16,12 +16,12 @@ along with Leela Chess. If not, see . */ +#include "neural/network.h" #include "neural/blas/batchnorm.h" #include "neural/blas/blas.h" #include "neural/blas/fully_connected_layer.h" #include "neural/blas/winograd_convolution3.h" #include "neural/factory.h" -#include "neural/network.h" #include "neural/opencl/OpenCL.h" #include "neural/opencl/OpenCLParams.h" @@ -57,11 +57,14 @@ struct OpenCLWeights { class OpenCLComputation : public NetworkComputation { public: - OpenCLComputation(const OpenCL_Network& opencl_net, - const OpenCLWeights& weights) - : opencl_net_(opencl_net), weights_(weights), policies_(), q_values_() {} + OpenCLComputation(const OpenCL_Network& opencl_net, const OpenCLWeights& weights) + : opencl_net_(opencl_net), weights_(weights), policies_(), q_values_() { + buffers_ = opencl_net.acquire_buffers(); + } - virtual ~OpenCLComputation() {} + virtual ~OpenCLComputation() { + opencl_net_.release_buffers(std::move(buffers_)); + } // Adds a sample to the batch. 
void AddInput(InputPlanes&& input) override { planes_.emplace_back(input); } @@ -91,7 +94,7 @@ class OpenCLComputation : public NetworkComputation { EncodePlanes(planes_[i + j], &input_data[j * kSquares * kInputPlanes]); } - opencl_net_.forward(input_data, output_pol, output_val, batch_size); + buffers_->forward(input_data, output_pol, output_val, batch_size); for (size_t j = 0; j < batch_size; j++) { std::vector policy(weights_.num_output_policies); @@ -139,6 +142,8 @@ class OpenCLComputation : public NetworkComputation { std::vector> policies_; std::vector q_values_; + + std::unique_ptr buffers_; }; void OpenCLComputation::EncodePlanes(const InputPlanes& sample, float* buffer) {