Skip to content

Commit

Permalink
Handle subdevice partition correctly
Browse files Browse the repository at this point in the history
  • Loading branch information
PietroGhg committed Apr 11, 2024
1 parent 461cf6e commit c594cdc
Show file tree
Hide file tree
Showing 8 changed files with 203 additions and 96 deletions.
14 changes: 8 additions & 6 deletions source/adapters/native_cpu/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
case UR_DEVICE_INFO_LINKER_AVAILABLE:
return ReturnValue(bool{false});
case UR_DEVICE_INFO_MAX_COMPUTE_UNITS:
return ReturnValue(static_cast<uint32_t>(
hDevice->tp.num_threads()));
return ReturnValue(static_cast<uint32_t>(hDevice->tp.num_threads()));
case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES:
return ReturnValue(uint32_t{0});
case UR_DEVICE_INFO_SUPPORTED_PARTITIONS:
Expand Down Expand Up @@ -139,7 +138,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS:
return ReturnValue(uint32_t{3});
case UR_DEVICE_INFO_PARTITION_TYPE:
return ReturnValue(ur_device_partition_property_t{});
if (pPropSizeRet) {
*pPropSizeRet = 0;
}
return UR_RESULT_SUCCESS;
case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION:
return ReturnValue("");
case UR_DEVICE_INFO_QUEUE_PROPERTIES:
Expand All @@ -159,8 +161,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT:
case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE:
case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF:
// todo: how can we query vector width in a platform
// indipendent way?
// TODO: How can we query vector width in a platform
// independent way?
case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR:
return ReturnValue(uint32_t{32});
case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT:
Expand Down Expand Up @@ -266,7 +268,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
case UR_DEVICE_INFO_ATOMIC_64:
return ReturnValue(bool{1});
case UR_DEVICE_INFO_BFLOAT16:
return ReturnValue(bool{1});
return ReturnValue(bool{0});
case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT:
return ReturnValue(bool{0});
case UR_DEVICE_INFO_IMAGE_SRGB:
Expand Down
10 changes: 2 additions & 8 deletions source/adapters/native_cpu/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,12 @@

#pragma once

#include <ur/ur.hpp>
#include "threadpool.hpp"
#include <ur/ur.hpp>

struct ur_device_handle_t_ {
native_cpu::threadpool_t tp;
ur_device_handle_t_(ur_platform_handle_t ArgPlt) : Platform(ArgPlt) {
tp.start();
}

~ur_device_handle_t_() {
tp.stop();
}
ur_device_handle_t_(ur_platform_handle_t ArgPlt) : Platform(ArgPlt) {}

ur_platform_handle_t Platform;
};
157 changes: 135 additions & 22 deletions source/adapters/native_cpu/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,17 @@
//
//===----------------------------------------------------------------------===//
#include <array>
#include <cstddef>
#include <cstdint>
#include <vector>

#include "ur_api.h"

#include "common.hpp"
#include "kernel.hpp"
#include "memory.hpp"
#include "threadpool.hpp"
#include "queue.hpp"
#include "threadpool.hpp"

namespace native_cpu {
struct NDRDescT {
Expand All @@ -37,9 +39,29 @@ struct NDRDescT {
GlobalOffset[I] = 0;
}
}

// Debug helper: writes this ND-range descriptor to `os` as three labelled
// lines — the global size, local size and global offset triples (one value
// per dimension, space-separated). Intended for tracing kernel launches;
// not used on any hot path.
void dump(std::ostream &os) const {
os << "GlobalSize: " << GlobalSize[0] << " " << GlobalSize[1] << " "
<< GlobalSize[2] << "\n";
os << "LocalSize: " << LocalSize[0] << " " << LocalSize[1] << " "
<< LocalSize[2] << "\n";
os << "GlobalOffset: " << GlobalOffset[0] << " " << GlobalOffset[1] << " "
<< GlobalOffset[2] << "\n";
}
};
} // namespace native_cpu

#ifdef NATIVECPU_USE_OCK
// Builds a native_cpu::state that mirrors the launch configuration in `ndr`
// except that the local size of dimension 0 is replaced by `itemsPerThread`,
// so a single scheduled task iterates over that many work items along dim 0
// (used when flattening a local-size-one range across the thread pool).
static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
                                         size_t itemsPerThread) {
  return native_cpu::state(ndr.GlobalSize[0], ndr.GlobalSize[1],
                           ndr.GlobalSize[2], itemsPerThread, ndr.LocalSize[1],
                           ndr.LocalSize[2], ndr.GlobalOffset[0],
                           ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
}
#endif

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
Expand All @@ -61,38 +83,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(

// TODO: add proper error checking
// TODO: add proper event dep management
native_cpu::NDRDescT ndr(workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize);
auto& tp = hQueue->device->tp;
native_cpu::NDRDescT ndr(workDim, pGlobalWorkOffset, pGlobalWorkSize,
pLocalWorkSize);
auto &tp = hQueue->device->tp;
const size_t numParallelThreads = tp.num_threads();
hKernel->updateMemPool(numParallelThreads);
std::vector<std::future<void>> futures;
std::vector<std::function<void(size_t, ur_kernel_handle_t_)>> groups;
auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
bool isLocalSizeOne =
ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;


native_cpu::state state(ndr.GlobalSize[0], ndr.GlobalSize[1],
ndr.GlobalSize[2], ndr.LocalSize[0], ndr.LocalSize[1],
ndr.LocalSize[2], ndr.GlobalOffset[0],
ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
if (isLocalSizeOne) {
// If the local size is one, we make the assumption that we are running a
// parallel_for over a sycl::range Todo: we could add compiler checks and
// kernel properties for this (e.g. check that no barriers are called, no
// local memory args).

auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
#ifndef NATIVECPU_USE_OCK
hKernel->handleLocalArgs(1, 0);
for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
for (unsigned g0 = 0; g0 < numWG0; g0++) {
#ifdef NATIVECPU_USE_OCK
state.update(g0, g1, g2);
hKernel->_subhandler(hKernel->_args.data(), &state);
#else
for (unsigned local2 = 0; local2 < ndr.LocalSize[2]; local2++) {
for (unsigned local1 = 0; local1 < ndr.LocalSize[1]; local1++) {
for (unsigned local0 = 0; local0 < ndr.LocalSize[0]; local0++) {
Expand All @@ -101,13 +110,118 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
}
}
}
#endif
}
}
}
#else
bool isLocalSizeOne =
ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads) {
// If the local size is one, we make the assumption that we are running a
// parallel_for over a sycl::range.
// Todo: we could add compiler checks and
// kernel properties for this (e.g. check that no barriers are called, no
// local memory args).

// Todo: this assumes that dim 0 is the best dimension over which we want to
// parallelize

// Since we also vectorize the kernel, and vectorization happens within the
// work group loop, it's better to have a large-ish local size. We can
// divide the global range by the number of threads, set that as the local
// size and peel everything else.

size_t new_num_work_groups_0 = numParallelThreads;
size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads;

for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) {
futures.emplace_back(
tp.schedule_task([&ndr = std::as_const(ndr), itemsPerThread,
hKernel, g0, g1, g2](size_t) {
native_cpu::state resized_state =
getResizedState(ndr, itemsPerThread);
resized_state.update(g0, g1, g2);
hKernel->_subhandler(hKernel->_args.data(), &resized_state);
}));
}
// Peel the remaining work items. Since the local size is 1, we iterate
// over the work groups.
for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
g0++) {
state.update(g0, g1, g2);
hKernel->_subhandler(hKernel->_args.data(), &state);
}
}
}

} else {
// We are running a parallel_for over an nd_range

if (numWG1 * numWG2 >= numParallelThreads) {
// Dimensions 1 and 2 have enough work, split them across the threadpool
for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
futures.emplace_back(
tp.schedule_task([state, kernel = *hKernel, numWG0, g1, g2,
numParallelThreads](size_t threadId) mutable {
for (unsigned g0 = 0; g0 < numWG0; g0++) {
kernel.handleLocalArgs(numParallelThreads, threadId);
state.update(g0, g1, g2);
kernel._subhandler(kernel._args.data(), &state);
}
}));
}
}
} else {
// Split dimension 0 across the threadpool
// Here we try to create groups of workgroups in order to reduce
// synchronization overhead
for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
for (unsigned g0 = 0; g0 < numWG0; g0++) {
groups.push_back(
[state, g0, g1, g2, numParallelThreads](
size_t threadId, ur_kernel_handle_t_ kernel) mutable {
kernel.handleLocalArgs(numParallelThreads, threadId);
state.update(g0, g1, g2);
kernel._subhandler(kernel._args.data(), &state);
});
}
}
}
auto numGroups = groups.size();
auto groupsPerThread = numGroups / numParallelThreads;
auto remainder = numGroups % numParallelThreads;
for (unsigned thread = 0; thread < numParallelThreads; thread++) {
futures.emplace_back(tp.schedule_task(
[&groups, thread, groupsPerThread, hKernel](size_t threadId) {
for (unsigned i = 0; i < groupsPerThread; i++) {
auto index = thread * groupsPerThread + i;
groups[index](threadId, *hKernel);
}
}));
}

// schedule the remaining tasks
if (remainder) {
futures.emplace_back(
tp.schedule_task([&groups, remainder,
scheduled = numParallelThreads * groupsPerThread,
hKernel](size_t threadId) {
for (unsigned i = 0; i < remainder; i++) {
auto index = scheduled + i;
groups[index](threadId, *hKernel);
}
}));
}
}
}

for (auto &f : futures)
f.get();
#endif // NATIVECPU_USE_OCK
// TODO: we should avoid calling clear here by avoiding using push_back
// in setKernelArgs.
hKernel->_args.clear();
Expand Down Expand Up @@ -553,4 +667,3 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe(

DIE_NO_IMPLEMENTATION;
}

17 changes: 8 additions & 9 deletions source/adapters/native_cpu/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,25 +40,25 @@ struct ur_kernel_handle_t_ : RefCounted {
ur_kernel_handle_t_(const char *name, nativecpu_task_t subhandler)
: _name{name}, _subhandler{std::move(subhandler)} {}

ur_kernel_handle_t_(const ur_kernel_handle_t_& other) : _name(other._name), _subhandler(other._subhandler),
_args(other._args), _localArgInfo(other._localArgInfo), _localMemPool(other._localMemPool), _localMemPoolSize(other._localMemPoolSize) {
ur_kernel_handle_t_(const ur_kernel_handle_t_ &other)
: _name(other._name), _subhandler(other._subhandler), _args(other._args),
_localArgInfo(other._localArgInfo), _localMemPool(other._localMemPool),
_localMemPoolSize(other._localMemPoolSize) {
incrementReferenceCount();
}

~ur_kernel_handle_t_() {
decrementReferenceCount();
if (_refCount == 0) {
if (decrementReferenceCount() == 0) {
free(_localMemPool);
}

}

const char *_name;
nativecpu_task_t _subhandler;
std::vector<native_cpu::NativeCPUArgDesc> _args;
std::vector<local_arg_info_t> _localArgInfo;

// To be called before enqueing the kernel.
// To be called before enqueueing the kernel.
void updateMemPool(size_t numParallelThreads) {
// compute requested size.
size_t reqSize = 0;
Expand All @@ -69,7 +69,7 @@ struct ur_kernel_handle_t_ : RefCounted {
return;
}
// realloc handles nullptr case
_localMemPool = (char*)realloc(_localMemPool, reqSize);
_localMemPool = (char *)realloc(_localMemPool, reqSize);
_localMemPoolSize = reqSize;
}

Expand All @@ -86,7 +86,6 @@ struct ur_kernel_handle_t_ : RefCounted {
}

private:
char* _localMemPool = nullptr;
char *_localMemPool = nullptr;
size_t _localMemPoolSize = 0;
};

5 changes: 5 additions & 0 deletions source/adapters/native_cpu/nativecpu_state.hpp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ struct state {
size_t MLocal_id[3];
size_t MNumGroups[3];
size_t MGlobalOffset[3];
uint32_t NumSubGroups, SubGroup_id, SubGroup_local_id, SubGroup_size;
state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0,
size_t localR1, size_t localR2, size_t globalO0, size_t globalO1,
size_t globalO2)
Expand All @@ -36,6 +37,10 @@ struct state {
MLocal_id[0] = 0;
MLocal_id[1] = 0;
MLocal_id[2] = 0;
NumSubGroups = 32;
SubGroup_id = 0;
SubGroup_local_id = 0;
SubGroup_size = 1;
}

void update(size_t group0, size_t group1, size_t group2, size_t local0,
Expand Down
3 changes: 1 addition & 2 deletions source/adapters/native_cpu/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@
#include "device.hpp"

struct ur_queue_handle_t_ : RefCounted {
ur_device_handle_t_ *device;
ur_device_handle_t_ *const device;

ur_queue_handle_t_(ur_device_handle_t_ *device) : device(device) {}

};
Loading

0 comments on commit c594cdc

Please sign in to comment.