Skip to content

Commit

Permalink
initial version
Browse files Browse the repository at this point in the history
  • Loading branch information
bashbaug committed Sep 18, 2024
1 parent 979eb30 commit 69ccc10
Show file tree
Hide file tree
Showing 5 changed files with 243 additions and 0 deletions.
6 changes: 6 additions & 0 deletions include/CL/opencl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1799,6 +1799,12 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_LO
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_KERNEL_CLOCK_CAPABILITIES_KHR, cl_device_kernel_clock_capabilities_khr)
#endif /* cl_khr_kernel_clock */

// Param traits for the cl_ext_float_atomics device queries. Each entry pairs a
// cl_device_info token with cl_device_fp_atomic_capabilities_ext, the type the
// sample code reads back from getInfo<> for these tokens. Only declared when
// the OpenCL headers in use define cl_ext_float_atomics.
#if defined(cl_ext_float_atomics)
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT, cl_device_fp_atomic_capabilities_ext)
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT, cl_device_fp_atomic_capabilities_ext)
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT, cl_device_fp_atomic_capabilities_ext)
#endif /* cl_ext_float_atomics */

#if defined(cl_intel_command_queue_families)
CL_HPP_PARAM_NAME_CL_INTEL_COMMAND_QUEUE_FAMILIES_(CL_HPP_DECLARE_PARAM_TRAITS_)
#endif // cl_intel_command_queue_families
Expand Down
10 changes: 10 additions & 0 deletions samples/16_floatatomics/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Copyright (c) 2019-2024 Ben Ashbaugh
#
# SPDX-License-Identifier: MIT

# Declares sample number 16, "floatatomics", built from main.cpp.
# NOTE(review): add_opencl_sample is a project-defined helper; TEST presumably
# registers the sample as a test and VERSION 120 presumably selects the
# targeted OpenCL API version (1.2) -- confirm against its definition.
add_opencl_sample(
TEST
NUMBER 16
TARGET floatatomics
VERSION 120
SOURCES main.cpp)
21 changes: 21 additions & 0 deletions samples/16_floatatomics/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Floating-point Atomic Adds

## Sample Purpose

TODO

Inspired by: https://pipinspace.github.io/blog/atomic-float-addition-in-opencl.html

## Key APIs and Concepts

TODO

## Command Line Options

| Option | Default Value | Description |
|:--|:-:|:--|
| `-d <index>` | 0 | Specify the index of the OpenCL device in the platform to execute the sample on.
| `-p <index>` | 0 | Specify the index of the OpenCL platform to execute the sample on.
| `-i <number>` | 16 | Specify the number of iterations to execute.
| `--gwx <number>` | 1048576 | Specify the global work size to execute, which is also the number of floating-point atomics to perform.
| `-e` | N/A | Unconditionally use the emulated floating-point atomic add.
205 changes: 205 additions & 0 deletions samples/16_floatatomics/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
/*
// Copyright (c) 2019-2024 Ben Ashbaugh
//
// SPDX-License-Identifier: MIT
*/

#include <popl/popl.hpp>

#include <CL/opencl.hpp>

#include <chrono>
#include <cinttypes>

#include "util.hpp"

// OpenCL C program source for the sample; main() builds it at runtime.
//
// atomic_add_f() selects one of three implementations when the kernel is
// compiled:
//   1. atomic_fetch_add_explicit on an atomic_float, when the compiler defines
//      __opencl_c_ext_fp32_global_atomic_add.
//   2. An inline PTX "atom.global.add.f32" instruction, when
//      cl_nv_pragma_unroll is defined (presumably used here as an indicator of
//      an NVIDIA compiler -- confirm).
//   3. A fallback built from atomic_xchg: the current value is swapped out for
//      0.0f, the addend is accumulated locally, and the sum is swapped back in,
//      retrying while the swap returns a non-zero value left by another
//      work-item.
// Defining EMULATE (main() passes -DEMULATE for the -e option) disables paths
// 1 and 2 and forces the fallback.
//
// FloatAtomicTest has every work-item add 1.0f to the single float at dst.
static const char kernelString[] = R"CLC(
inline float atomic_add_f(volatile global float* addr, float val)
{
#if defined(__opencl_c_ext_fp32_global_atomic_add) && !defined(EMULATE)
//#pragma message("using cl_ext_float_atomics")
return atomic_fetch_add_explicit((volatile global atomic_float*)addr, val, memory_order_relaxed);
#elif defined(cl_nv_pragma_unroll) && !defined(EMULATE)
//#pragma message("using PTX atomics")
float ret; asm volatile("atom.global.add.f32 %0,[%1],%2;":"=f"(ret):"l"(addr),"f"(val):"memory");
return ret;
#else // fallback, see: https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639/7
//#pragma message("using emulated float atomics")
float ret = atomic_xchg(addr, 0.0f);
float old = ret + val;
while((old = atomic_xchg(addr, old)) != 0.0f) {
old = atomic_xchg(addr, 0.0f) + old;
}
return ret;
#endif
}
kernel void FloatAtomicTest(global float* dst)
{
atomic_add_f(dst, 1.0f);
}
)CLC";

// Prints, one per line, the name of every known cl_ext_float_atomics
// capability bit that is set in `caps`. Any remaining bits that this sample
// does not recognize are reported together as a single hexadecimal value.
//
// caps: capability bitfield as returned by one of the
//       CL_DEVICE_*_FP_ATOMIC_CAPABILITIES_EXT device queries.
static void PrintFloatAtomicCapabilities(
    cl_device_fp_atomic_capabilities_ext caps )
{
    struct KnownCapability {
        cl_device_fp_atomic_capabilities_ext bit;
        const char* name;
    };

    // Single table drives both the printing and the "unknown bits" mask, so
    // the two cannot drift apart when a capability is added.
    static const KnownCapability knownCapabilities[] = {
        { CL_DEVICE_GLOBAL_FP_ATOMIC_LOAD_STORE_EXT, "CL_DEVICE_GLOBAL_FP_ATOMIC_LOAD_STORE_EXT" },
        { CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT,        "CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT"        },
        { CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT,    "CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT"    },
        { CL_DEVICE_LOCAL_FP_ATOMIC_LOAD_STORE_EXT,  "CL_DEVICE_LOCAL_FP_ATOMIC_LOAD_STORE_EXT"  },
        { CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT,         "CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT"         },
        { CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT,     "CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT"     },
    };

    // Use the float-atomics capability type for the leftover mask; the
    // command-buffer capability type belongs to an unrelated extension and
    // may not be declared by the headers in use.
    cl_device_fp_atomic_capabilities_ext extra = caps;
    for (const KnownCapability& capability : knownCapabilities) {
        if (caps & capability.bit) {
            printf("\t\t%s\n", capability.name);
        }
        extra &= ~capability.bit;
    }

    if (extra) {
        printf("\t\t(Unknown capability: %016" PRIx64 ")\n", extra);
    }
}

int main(
int argc,
char** argv )
{
int platformIndex = 0;
int deviceIndex = 0;

size_t iterations = 16;
size_t gwx = 1024 * 1024;

bool emulate = false;

{
popl::OptionParser op("Supported Options");
op.add<popl::Value<int>>("p", "platform", "Platform Index", platformIndex, &platformIndex);
op.add<popl::Value<int>>("d", "device", "Device Index", deviceIndex, &deviceIndex);
op.add<popl::Value<size_t>>("i", "iterations", "Iterations", iterations, &iterations);
op.add<popl::Value<size_t>>("", "gwx", "Global Work Size X AKA Number of Atomics", gwx, &gwx);
op.add<popl::Switch>("e", "emulate", "Unconditionally Emulate Float Atomics", &emulate);

bool printUsage = false;
try {
op.parse(argc, argv);
} catch (std::exception& e) {
fprintf(stderr, "Error: %s\n\n", e.what());
printUsage = true;
}

if (printUsage || !op.unknown_options().empty() || !op.non_option_args().empty()) {
fprintf(stderr,
"Usage: floatatomics [options]\n"
"%s", op.help().c_str());
return -1;
}
}

std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);

printf("Running on platform: %s\n",
platforms[platformIndex].getInfo<CL_PLATFORM_NAME>().c_str() );

std::vector<cl::Device> devices;
platforms[platformIndex].getDevices(CL_DEVICE_TYPE_ALL, &devices);

printf("Running on device: %s\n",
devices[deviceIndex].getInfo<CL_DEVICE_NAME>().c_str() );

if (checkDeviceForExtension(devices[deviceIndex], CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME)) {
printf("Device supports " CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME ".\n");

cl_device_fp_atomic_capabilities_ext spcaps =
devices[deviceIndex].getInfo<CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT>();
printf("CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT:\n");
PrintFloatAtomicCapabilities(spcaps);

cl_device_fp_atomic_capabilities_ext dpcaps =
devices[deviceIndex].getInfo<CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT>();
printf("CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT:\n");
PrintFloatAtomicCapabilities(dpcaps);

cl_device_fp_atomic_capabilities_ext hpcaps =
devices[deviceIndex].getInfo<CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT>();
printf("CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT:\n");
PrintFloatAtomicCapabilities(hpcaps);

if (spcaps & CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT == 0) {
printf("Device does not support fp32 atomic add.\n");
}
} else {
printf("Device does not support " CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME ".\n");
}

cl::Context context{devices[deviceIndex]};
cl::CommandQueue commandQueue{context, devices[deviceIndex]};

cl::Program program{ context, kernelString };
std::string buildOptions = "-cl-std=CL3.0";

if (emulate) {
printf("Forcing emulation.\n");
buildOptions += " -DEMULATE";
}

program.build(buildOptions);
cl::Kernel kernel = cl::Kernel{ program, "FloatAtomicTest" };

cl::Buffer deviceMemDst = cl::Buffer{
context,
CL_MEM_READ_WRITE,
sizeof(cl_float) };

// execution
{
kernel.setArg(0, deviceMemDst);

// Ensure the queue is empty and no processing is happening
// on the device before starting the timer.
commandQueue.finish();

auto start = std::chrono::system_clock::now();
for( size_t i = 0; i < iterations; i++ )
{
cl_float zero = 0.0f;
commandQueue.enqueueFillBuffer(
deviceMemDst,
zero,
0,
sizeof(zero));
commandQueue.enqueueNDRangeKernel(
kernel,
cl::NullRange,
cl::NDRange{gwx});
}

// Ensure all processing is complete before stopping the timer.
commandQueue.finish();

auto end = std::chrono::system_clock::now();
std::chrono::duration<float> elapsed_seconds = end - start;
printf("Finished in %f seconds\n", elapsed_seconds.count());
}

// validation
{
cl_float result = 0.0f;
commandQueue.enqueueReadBuffer(
deviceMemDst,
CL_TRUE,
0,
sizeof(result),
&result);
if (result != (float)gwx) {
printf("Error: expected %f, got %f!\n", (float)gwx, result);
} else {
printf("Success.\n");
}
}

return 0;
}
1 change: 1 addition & 0 deletions samples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ add_subdirectory( 05_spirvkernelfromfile )
add_subdirectory( 06_ndrangekernelfromfile )

add_subdirectory( 10_queueexperiments )
add_subdirectory( 16_floatatomics )

set(BUILD_EXTENSION_SAMPLES TRUE)
if(NOT TARGET OpenCLExt)
Expand Down

0 comments on commit 69ccc10

Please sign in to comment.