From 2676d8588178c27f5d7000185e3f2ec06e2fcd9d Mon Sep 17 00:00:00 2001
From: Patrick Stotko <stotko@cs.uni-bonn.de>
Date: Mon, 18 Nov 2024 15:29:04 +0100
Subject: [PATCH] atomic: Add non-member functions for fetch_min and fetch_max

---
 src/stdgpu/atomic.cuh             | 138 +++++++----
 src/stdgpu/impl/atomic_detail.cuh | 100 +++++---
 tests/stdgpu/atomic.inc           | 364 +++++++++++++++++++++---------
 3 files changed, 414 insertions(+), 188 deletions(-)
diff --git a/src/stdgpu/atomic.cuh b/src/stdgpu/atomic.cuh
index b1bd86fa9..36c5f4131 100644
--- a/src/stdgpu/atomic.cuh
+++ b/src/stdgpu/atomic.cuh
@@ -283,54 +283,54 @@ public:
     fetch_sub(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
 
     /**
-     * \brief Atomically computes and stores the bitwise AND of the stored value and the given argument
-     * \param[in] arg The other argument of bitwise AND
+     * \brief Atomically computes and stores the minimum of the stored value and the given argument
+     * \param[in] arg The other argument of minimum
      * \param[in] order The memory order
      * \return The old value
      */
-    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T>)>
+    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
     STDGPU_DEVICE_ONLY T
-    fetch_and(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
+    fetch_min(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
 
     /**
-     * \brief Atomically computes and stores the bitwise OR of the stored value and the given argument
-     * \param[in] arg The other argument of bitwise OR
+     * \brief Atomically computes and stores the maximum of the stored value and the given argument
+     * \param[in] arg The other argument of maximum
      * \param[in] order The memory order
      * \return The old value
      */
-    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T>)>
+    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
     STDGPU_DEVICE_ONLY T
-    fetch_or(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
+    fetch_max(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
 
     /**
-     * \brief Atomically computes and stores the bitwise XOR of the stored value and the given argument
-     * \param[in] arg The other argument of bitwise XOR
+     * \brief Atomically computes and stores the bitwise AND of the stored value and the given argument
+     * \param[in] arg The other argument of bitwise AND
      * \param[in] order The memory order
      * \return The old value
      */
     template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T>)>
     STDGPU_DEVICE_ONLY T
-    fetch_xor(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
+    fetch_and(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
 
     /**
-     * \brief Atomically computes and stores the minimum of the stored value and the given argument
-     * \param[in] arg The other argument of minimum
+     * \brief Atomically computes and stores the bitwise OR of the stored value and the given argument
+     * \param[in] arg The other argument of bitwise OR
      * \param[in] order The memory order
      * \return The old value
      */
-    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
+    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T>)>
     STDGPU_DEVICE_ONLY T
-    fetch_min(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
+    fetch_or(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
 
     /**
-     * \brief Atomically computes and stores the maximum of the stored value and the given argument
-     * \param[in] arg The other argument of maximum
+     * \brief Atomically computes and stores the bitwise XOR of the stored value and the given argument
+     * \param[in] arg The other argument of bitwise XOR
      * \param[in] order The memory order
      * \return The old value
      */
-    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
+    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T>)>
     STDGPU_DEVICE_ONLY T
-    fetch_max(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
+    fetch_xor(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
 
     /**
      * \brief Atomically computes and stores the incrementation of the value and modulus with arg
@@ -571,54 +571,54 @@ public:
     fetch_sub(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
 
     /**
-     * \brief Atomically computes and stores the bitwise AND of the stored value and the given argument
-     * \param[in] arg The other argument of bitwise AND
+     * \brief Atomically computes and stores the minimum of the stored value and the given argument
+     * \param[in] arg The other argument of minimum
      * \param[in] order The memory order
      * \return The old value
      */
-    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T>)>
+    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
     STDGPU_DEVICE_ONLY T
-    fetch_and(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
+    fetch_min(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
 
     /**
-     * \brief Atomically computes and stores the bitwise OR of the stored value and the given argument
-     * \param[in] arg The other argument of bitwise OR
+     * \brief Atomically computes and stores the maximum of the stored value and the given argument
+     * \param[in] arg The other argument of maximum
      * \param[in] order The memory order
      * \return The old value
      */
-    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T>)>
+    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
     STDGPU_DEVICE_ONLY T
-    fetch_or(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
+    fetch_max(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
 
     /**
-     * \brief Atomically computes and stores the bitwise XOR of the stored value and the given argument
-     * \param[in] arg The other argument of bitwise XOR
+     * \brief Atomically computes and stores the bitwise AND of the stored value and the given argument
+     * \param[in] arg The other argument of bitwise AND
      * \param[in] order The memory order
      * \return The old value
      */
     template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T>)>
     STDGPU_DEVICE_ONLY T
-    fetch_xor(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
+    fetch_and(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
 
     /**
-     * \brief Atomically computes and stores the minimum of the stored value and the given argument
-     * \param[in] arg The other argument of minimum
+     * \brief Atomically computes and stores the bitwise OR of the stored value and the given argument
+     * \param[in] arg The other argument of bitwise OR
      * \param[in] order The memory order
      * \return The old value
      */
-    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
+    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T>)>
     STDGPU_DEVICE_ONLY T
-    fetch_min(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
+    fetch_or(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
 
     /**
-     * \brief Atomically computes and stores the maximum of the stored value and the given argument
-     * \param[in] arg The other argument of maximum
+     * \brief Atomically computes and stores the bitwise XOR of the stored value and the given argument
+     * \param[in] arg The other argument of bitwise XOR
      * \param[in] order The memory order
      * \return The old value
      */
-    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
+    template <STDGPU_DETAIL_OVERLOAD_IF(std::is_integral_v<T>)>
     STDGPU_DEVICE_ONLY T
-    fetch_max(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
+    fetch_xor(const T arg, const memory_order order = memory_order_seq_cst) noexcept;
 
     /**
      * \brief Atomically computes and stores the incrementation of the value and modulus with arg
@@ -882,6 +882,56 @@ atomic_fetch_sub_explicit(atomic<T, Allocator>* obj,
                           const typename atomic<T, Allocator>::difference_type arg,
                           const memory_order order) noexcept;
 
+/**
+ * \ingroup atomic
+ * \brief Atomically computes and stores the minimum of the stored value and the given argument
+ * \param[in] obj Pointer to the atomic object; dereferenced without a null check, so it must be valid
+ * \param[in] arg The other argument of minimum
+ * \return The old value, as returned by obj->fetch_min(arg) called without an explicit memory order
+ */
+template <typename T, typename Allocator>
+STDGPU_DEVICE_ONLY T
+atomic_fetch_min(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::value_type arg) noexcept;
+
+/**
+ * \ingroup atomic
+ * \brief Atomically computes and stores the minimum of the stored value and the given argument
+ * \param[in] obj Pointer to the atomic object; dereferenced without a null check, so it must be valid
+ * \param[in] arg The other argument of minimum
+ * \param[in] order The memory order, passed through unchanged to obj->fetch_min(arg, order)
+ * \return The old value, as returned by obj->fetch_min(arg, order)
+ */
+template <typename T, typename Allocator>
+STDGPU_DEVICE_ONLY T
+atomic_fetch_min_explicit(atomic<T, Allocator>* obj,
+                          const typename atomic<T, Allocator>::value_type arg,
+                          const memory_order order) noexcept;
+
+/**
+ * \ingroup atomic
+ * \brief Atomically computes and stores the maximum of the stored value and the given argument
+ * \param[in] obj Pointer to the atomic object; dereferenced without a null check, so it must be valid
+ * \param[in] arg The other argument of maximum
+ * \return The old value, as returned by obj->fetch_max(arg) called without an explicit memory order
+ */
+template <typename T, typename Allocator>
+STDGPU_DEVICE_ONLY T
+atomic_fetch_max(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::value_type arg) noexcept;
+
+/**
+ * \ingroup atomic
+ * \brief Atomically computes and stores the maximum of the stored value and the given argument
+ * \param[in] obj Pointer to the atomic object; dereferenced without a null check, so it must be valid
+ * \param[in] arg The other argument of maximum
+ * \param[in] order The memory order, passed through unchanged to obj->fetch_max(arg, order)
+ * \return The old value, as returned by obj->fetch_max(arg, order)
+ */
+template <typename T, typename Allocator>
+STDGPU_DEVICE_ONLY T
+atomic_fetch_max_explicit(atomic<T, Allocator>* obj,
+                          const typename atomic<T, Allocator>::value_type arg,
+                          const memory_order order) noexcept;
+
 /**
  * \ingroup atomic
  * \brief Atomically computes and stores the addition of the stored value and the given argument
@@ -891,7 +941,7 @@ atomic_fetch_sub_explicit(atomic<T, Allocator>* obj,
  */
 template <typename T, typename Allocator>
 STDGPU_DEVICE_ONLY T
-atomic_fetch_and(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::difference_type arg) noexcept;
+atomic_fetch_and(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::value_type arg) noexcept;
 
 /**
  * \ingroup atomic
@@ -904,7 +954,7 @@ atomic_fetch_and(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>:
 template <typename T, typename Allocator>
 STDGPU_DEVICE_ONLY T
 atomic_fetch_and_explicit(atomic<T, Allocator>* obj,
-                          const typename atomic<T, Allocator>::difference_type arg,
+                          const typename atomic<T, Allocator>::value_type arg,
                           const memory_order order) noexcept;
 
 /**
@@ -916,7 +966,7 @@ atomic_fetch_and_explicit(atomic<T, Allocator>* obj,
  */
 template <typename T, typename Allocator>
 STDGPU_DEVICE_ONLY T
-atomic_fetch_or(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::difference_type arg) noexcept;
+atomic_fetch_or(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::value_type arg) noexcept;
 
 /**
  * \ingroup atomic
@@ -929,7 +979,7 @@ atomic_fetch_or(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::
 template <typename T, typename Allocator>
 STDGPU_DEVICE_ONLY T
 atomic_fetch_or_explicit(atomic<T, Allocator>* obj,
-                         const typename atomic<T, Allocator>::difference_type arg,
+                         const typename atomic<T, Allocator>::value_type arg,
                          const memory_order order) noexcept;
 
 /**
@@ -941,7 +991,7 @@ atomic_fetch_or_explicit(atomic<T, Allocator>* obj,
  */
 template <typename T, typename Allocator>
 STDGPU_DEVICE_ONLY T
-atomic_fetch_xor(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::difference_type arg) noexcept;
+atomic_fetch_xor(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::value_type arg) noexcept;
 
 /**
  * \ingroup atomic
@@ -954,7 +1004,7 @@ atomic_fetch_xor(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>:
 template <typename T, typename Allocator>
 STDGPU_DEVICE_ONLY T
 atomic_fetch_xor_explicit(atomic<T, Allocator>* obj,
-                          const typename atomic<T, Allocator>::difference_type arg,
+                          const typename atomic<T, Allocator>::value_type arg,
                           const memory_order order) noexcept;
 
 } // namespace stdgpu
diff --git a/src/stdgpu/impl/atomic_detail.cuh b/src/stdgpu/impl/atomic_detail.cuh
index fdf6e895d..09f7f8d8c 100644
--- a/src/stdgpu/impl/atomic_detail.cuh
+++ b/src/stdgpu/impl/atomic_detail.cuh
@@ -265,43 +265,43 @@ atomic<T, Allocator>::fetch_sub(const T arg, const memory_order order) noexcept
 }
 
 template <typename T, typename Allocator>
-template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T>)>
+template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
 inline STDGPU_DEVICE_ONLY T
-atomic<T, Allocator>::fetch_and(const T arg, const memory_order order) noexcept
+atomic<T, Allocator>::fetch_min(const T arg, const memory_order order) noexcept
 {
-    return _value_ref.fetch_and(arg, order);
+    return _value_ref.fetch_min(arg, order);
 }
 
 template <typename T, typename Allocator>
-template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T>)>
+template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
 inline STDGPU_DEVICE_ONLY T
-atomic<T, Allocator>::fetch_or(const T arg, const memory_order order) noexcept
+atomic<T, Allocator>::fetch_max(const T arg, const memory_order order) noexcept
 {
-    return _value_ref.fetch_or(arg, order);
+    return _value_ref.fetch_max(arg, order);
 }
 
 template <typename T, typename Allocator>
 template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T>)>
 inline STDGPU_DEVICE_ONLY T
-atomic<T, Allocator>::fetch_xor(const T arg, const memory_order order) noexcept
+atomic<T, Allocator>::fetch_and(const T arg, const memory_order order) noexcept
 {
-    return _value_ref.fetch_xor(arg, order);
+    return _value_ref.fetch_and(arg, order);
 }
 
 template <typename T, typename Allocator>
-template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
+template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T>)>
 inline STDGPU_DEVICE_ONLY T
-atomic<T, Allocator>::fetch_min(const T arg, const memory_order order) noexcept
+atomic<T, Allocator>::fetch_or(const T arg, const memory_order order) noexcept
 {
-    return _value_ref.fetch_min(arg, order);
+    return _value_ref.fetch_or(arg, order);
 }
 
 template <typename T, typename Allocator>
-template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
+template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T>)>
 inline STDGPU_DEVICE_ONLY T
-atomic<T, Allocator>::fetch_max(const T arg, const memory_order order) noexcept
+atomic<T, Allocator>::fetch_xor(const T arg, const memory_order order) noexcept
 {
-    return _value_ref.fetch_max(arg, order);
+    return _value_ref.fetch_xor(arg, order);
 }
 
 template <typename T, typename Allocator>
@@ -541,13 +541,13 @@ atomic_ref<T>::fetch_sub(const T arg, const memory_order order) noexcept
 }
 
 template <typename T>
-template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T>)>
+template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
 inline STDGPU_DEVICE_ONLY T
-atomic_ref<T>::fetch_and(const T arg, const memory_order order) noexcept
+atomic_ref<T>::fetch_min(const T arg, const memory_order order) noexcept
 {
     detail::atomic_load_thread_fence(order);
 
-    T result = stdgpu::STDGPU_BACKEND_NAMESPACE::atomic_fetch_and(_value, arg);
+    T result = stdgpu::STDGPU_BACKEND_NAMESPACE::atomic_fetch_min(_value, arg);
 
     detail::atomic_store_thread_fence(order);
 
@@ -555,13 +555,13 @@ atomic_ref<T>::fetch_and(const T arg, const memory_order order) noexcept
 }
 
 template <typename T>
-template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T>)>
+template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
 inline STDGPU_DEVICE_ONLY T
-atomic_ref<T>::fetch_or(const T arg, const memory_order order) noexcept
+atomic_ref<T>::fetch_max(const T arg, const memory_order order) noexcept
 {
     detail::atomic_load_thread_fence(order);
 
-    T result = stdgpu::STDGPU_BACKEND_NAMESPACE::atomic_fetch_or(_value, arg);
+    T result = stdgpu::STDGPU_BACKEND_NAMESPACE::atomic_fetch_max(_value, arg);
 
     detail::atomic_store_thread_fence(order);
 
@@ -571,11 +571,11 @@ atomic_ref<T>::fetch_or(const T arg, const memory_order order) noexcept
 template <typename T>
 template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T>)>
 inline STDGPU_DEVICE_ONLY T
-atomic_ref<T>::fetch_xor(const T arg, const memory_order order) noexcept
+atomic_ref<T>::fetch_and(const T arg, const memory_order order) noexcept
 {
     detail::atomic_load_thread_fence(order);
 
-    T result = stdgpu::STDGPU_BACKEND_NAMESPACE::atomic_fetch_xor(_value, arg);
+    T result = stdgpu::STDGPU_BACKEND_NAMESPACE::atomic_fetch_and(_value, arg);
 
     detail::atomic_store_thread_fence(order);
 
@@ -583,13 +583,13 @@ atomic_ref<T>::fetch_xor(const T arg, const memory_order order) noexcept
 }
 
 template <typename T>
-template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
+template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T>)>
 inline STDGPU_DEVICE_ONLY T
-atomic_ref<T>::fetch_min(const T arg, const memory_order order) noexcept
+atomic_ref<T>::fetch_or(const T arg, const memory_order order) noexcept
 {
     detail::atomic_load_thread_fence(order);
 
-    T result = stdgpu::STDGPU_BACKEND_NAMESPACE::atomic_fetch_min(_value, arg);
+    T result = stdgpu::STDGPU_BACKEND_NAMESPACE::atomic_fetch_or(_value, arg);
 
     detail::atomic_store_thread_fence(order);
 
@@ -597,13 +597,13 @@ atomic_ref<T>::fetch_min(const T arg, const memory_order order) noexcept
 }
 
 template <typename T>
-template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T> || std::is_floating_point_v<T>)>
+template <STDGPU_DETAIL_OVERLOAD_DEFINITION_IF(std::is_integral_v<T>)>
 inline STDGPU_DEVICE_ONLY T
-atomic_ref<T>::fetch_max(const T arg, const memory_order order) noexcept
+atomic_ref<T>::fetch_xor(const T arg, const memory_order order) noexcept
 {
     detail::atomic_load_thread_fence(order);
 
-    T result = stdgpu::STDGPU_BACKEND_NAMESPACE::atomic_fetch_max(_value, arg);
+    T result = stdgpu::STDGPU_BACKEND_NAMESPACE::atomic_fetch_xor(_value, arg);
 
     detail::atomic_store_thread_fence(order);
 
@@ -815,7 +815,39 @@ atomic_fetch_sub_explicit(atomic<T, Allocator>* obj,
 
 template <typename T, typename Allocator>
 inline STDGPU_DEVICE_ONLY T
-atomic_fetch_and(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::difference_type arg) noexcept
+atomic_fetch_min(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::value_type arg) noexcept
+{
+    return obj->fetch_min(arg); // thin wrapper: no order argument, so the member's default order applies
+}
+
+template <typename T, typename Allocator>
+inline STDGPU_DEVICE_ONLY T
+atomic_fetch_min_explicit(atomic<T, Allocator>* obj,
+                          const typename atomic<T, Allocator>::value_type arg,
+                          const memory_order order) noexcept
+{
+    return obj->fetch_min(arg, order); // thin wrapper: forwards the caller-supplied memory order unchanged
+}
+
+template <typename T, typename Allocator>
+inline STDGPU_DEVICE_ONLY T
+atomic_fetch_max(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::value_type arg) noexcept
+{
+    return obj->fetch_max(arg); // thin wrapper: no order argument, so the member's default order applies
+}
+
+template <typename T, typename Allocator>
+inline STDGPU_DEVICE_ONLY T
+atomic_fetch_max_explicit(atomic<T, Allocator>* obj,
+                          const typename atomic<T, Allocator>::value_type arg,
+                          const memory_order order) noexcept
+{
+    return obj->fetch_max(arg, order); // thin wrapper: forwards the caller-supplied memory order unchanged
+}
+
+template <typename T, typename Allocator>
+inline STDGPU_DEVICE_ONLY T
+atomic_fetch_and(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::value_type arg) noexcept
 {
     return obj->fetch_and(arg);
 }
@@ -823,7 +855,7 @@ atomic_fetch_and(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>:
 template <typename T, typename Allocator>
 inline STDGPU_DEVICE_ONLY T
 atomic_fetch_and_explicit(atomic<T, Allocator>* obj,
-                          const typename atomic<T, Allocator>::difference_type arg,
+                          const typename atomic<T, Allocator>::value_type arg,
                           const memory_order order) noexcept
 {
     return obj->fetch_and(arg, order);
@@ -831,7 +863,7 @@ atomic_fetch_and_explicit(atomic<T, Allocator>* obj,
 
 template <typename T, typename Allocator>
 inline STDGPU_DEVICE_ONLY T
-atomic_fetch_or(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::difference_type arg) noexcept
+atomic_fetch_or(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::value_type arg) noexcept
 {
     return obj->fetch_or(arg);
 }
@@ -839,7 +871,7 @@ atomic_fetch_or(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::
 template <typename T, typename Allocator>
 inline STDGPU_DEVICE_ONLY T
 atomic_fetch_or_explicit(atomic<T, Allocator>* obj,
-                         const typename atomic<T, Allocator>::difference_type arg,
+                         const typename atomic<T, Allocator>::value_type arg,
                          const memory_order order) noexcept
 {
     return obj->fetch_or(arg, order);
@@ -847,7 +879,7 @@ atomic_fetch_or_explicit(atomic<T, Allocator>* obj,
 
 template <typename T, typename Allocator>
 inline STDGPU_DEVICE_ONLY T
-atomic_fetch_xor(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::difference_type arg) noexcept
+atomic_fetch_xor(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>::value_type arg) noexcept
 {
     return obj->fetch_xor(arg);
 }
@@ -855,7 +887,7 @@ atomic_fetch_xor(atomic<T, Allocator>* obj, const typename atomic<T, Allocator>:
 template <typename T, typename Allocator>
 inline STDGPU_DEVICE_ONLY T
 atomic_fetch_xor_explicit(atomic<T, Allocator>* obj,
-                          const typename atomic<T, Allocator>::difference_type arg,
+                          const typename atomic<T, Allocator>::value_type arg,
                           const memory_order order) noexcept
 {
     return obj->fetch_xor(arg, order);
diff --git a/tests/stdgpu/atomic.inc b/tests/stdgpu/atomic.inc
index ebbe745ab..7faddf724 100644
--- a/tests/stdgpu/atomic.inc
+++ b/tests/stdgpu/atomic.inc
@@ -1125,6 +1125,260 @@ TEST_F(stdgpu_atomic, operator_sub_equals_unsigned_long_long_int)
     sequence_operator_sub_equals<unsigned long long int>();
 }
 
+template <typename T>
+class min_sequence // device functor exercising the member function atomic<T>::fetch_min
+{
+public:
+    min_sequence(const stdgpu::atomic<T>& value, T* sequence)
+      : _value(value) // stores a copy of the atomic object, not a reference
+      , _sequence(sequence)
+    {
+    }
+
+    STDGPU_DEVICE_ONLY void
+    operator()(const stdgpu::index_t i)
+    {
+        _value.fetch_min(_sequence[i]); // returned old value is discarded; no explicit memory order
+    }
+
+private:
+    stdgpu::atomic<T> _value; // NOTE(review): copy presumably aliases the caller's storage (test reads the original) - confirm
+    T* _sequence; // input values, read at the invocation index i
+};
+
+template <typename T>
+class min_sequence_nonmember // device functor exercising the free function stdgpu::atomic_fetch_min
+{
+public:
+    min_sequence_nonmember(const stdgpu::atomic<T>& value, T* sequence)
+      : _value(value) // stores a copy of the atomic object, not a reference
+      , _sequence(sequence)
+    {
+    }
+
+    STDGPU_DEVICE_ONLY void
+    operator()(const stdgpu::index_t i)
+    {
+        stdgpu::atomic_fetch_min(&_value, _sequence[i]); // free-function form takes the atomic by pointer
+    }
+
+private:
+    stdgpu::atomic<T> _value; // NOTE(review): copy presumably aliases the caller's storage (test reads the original) - confirm
+    T* _sequence; // input values, read at the invocation index i
+};
+
+template <typename T>
+class min_sequence_nonmember_explicit // device functor exercising stdgpu::atomic_fetch_min_explicit
+{
+public:
+    min_sequence_nonmember_explicit(const stdgpu::atomic<T>& value, T* sequence)
+      : _value(value) // stores a copy of the atomic object, not a reference
+      , _sequence(sequence)
+    {
+    }
+
+    STDGPU_DEVICE_ONLY void
+    operator()(const stdgpu::index_t i)
+    { // the only order exercised through this functor is memory_order_relaxed
+        stdgpu::atomic_fetch_min_explicit(&_value, _sequence[i], stdgpu::memory_order_relaxed);
+    }
+
+private:
+    stdgpu::atomic<T> _value; // NOTE(review): copy presumably aliases the caller's storage (test reads the original) - confirm
+    T* _sequence; // input values, read at the invocation index i
+};
+
+template <typename T, template <typename> class Function> // Function<T> is built from (atomic<T>, T*)
+void
+sequence_fetch_min()
+{
+    const stdgpu::index_t N = 40000; // number of indices passed to for_each_index
+    T* sequence = createDeviceArray<T>(N); // filled by the iota call below, starting at T(1)
+    stdgpu::iota(stdgpu::execution::device, stdgpu::device_begin(sequence), stdgpu::device_end(sequence), T(1));
+
+    stdgpu::atomic<T> value = stdgpu::atomic<T>::createDeviceObject();
+    value.store(std::numeric_limits<T>::max()); // start at the largest T so the initial value cannot win the min
+
+    stdgpu::for_each_index(stdgpu::execution::device, N, Function<T>(value, sequence));
+
+    EXPECT_EQ(value.load(), T(1)); // T(1) is the iota start value, expected to be the sequence minimum
+
+    destroyDeviceArray<T>(sequence); // release both resources created above
+    stdgpu::atomic<T>::destroyDeviceObject(value);
+}
+
+TEST_F(stdgpu_atomic, fetch_min_int)
+{
+    sequence_fetch_min<int, min_sequence>(); // member fetch_min
+}
+
+TEST_F(stdgpu_atomic, fetch_min_unsigned_int)
+{
+    sequence_fetch_min<unsigned int, min_sequence>(); // member fetch_min
+}
+
+TEST_F(stdgpu_atomic, fetch_min_unsigned_long_long_int)
+{
+    sequence_fetch_min<unsigned long long int, min_sequence>(); // member fetch_min
+}
+
+TEST_F(stdgpu_atomic, fetch_min_nonmember_int)
+{
+    sequence_fetch_min<int, min_sequence_nonmember>(); // free function atomic_fetch_min
+}
+
+TEST_F(stdgpu_atomic, fetch_min_nonmember_unsigned_int)
+{
+    sequence_fetch_min<unsigned int, min_sequence_nonmember>(); // free function atomic_fetch_min
+}
+
+TEST_F(stdgpu_atomic, fetch_min_nonmember_unsigned_long_long_int)
+{
+    sequence_fetch_min<unsigned long long int, min_sequence_nonmember>(); // free function atomic_fetch_min
+}
+
+TEST_F(stdgpu_atomic, fetch_min_nonmember_explicit_int)
+{
+    sequence_fetch_min<int, min_sequence_nonmember_explicit>(); // atomic_fetch_min_explicit, relaxed order
+}
+
+TEST_F(stdgpu_atomic, fetch_min_nonmember_explicit_unsigned_int)
+{
+    sequence_fetch_min<unsigned int, min_sequence_nonmember_explicit>(); // atomic_fetch_min_explicit, relaxed order
+}
+
+TEST_F(stdgpu_atomic, fetch_min_nonmember_explicit_unsigned_long_long_int)
+{
+    sequence_fetch_min<unsigned long long int, min_sequence_nonmember_explicit>(); // explicit variant, relaxed order
+}
+
+template <typename T>
+class max_sequence // device functor exercising the member function atomic<T>::fetch_max
+{
+public:
+    max_sequence(const stdgpu::atomic<T>& value, T* sequence)
+      : _value(value) // stores a copy of the atomic object, not a reference
+      , _sequence(sequence)
+    {
+    }
+
+    STDGPU_DEVICE_ONLY void
+    operator()(const stdgpu::index_t i)
+    {
+        _value.fetch_max(_sequence[i]); // returned old value is discarded; no explicit memory order
+    }
+
+private:
+    stdgpu::atomic<T> _value; // NOTE(review): copy presumably aliases the caller's storage (test reads the original) - confirm
+    T* _sequence; // input values, read at the invocation index i
+};
+
+template <typename T>
+class max_sequence_nonmember // device functor exercising the free function stdgpu::atomic_fetch_max
+{
+public:
+    max_sequence_nonmember(const stdgpu::atomic<T>& value, T* sequence)
+      : _value(value) // stores a copy of the atomic object, not a reference
+      , _sequence(sequence)
+    {
+    }
+
+    STDGPU_DEVICE_ONLY void
+    operator()(const stdgpu::index_t i)
+    {
+        stdgpu::atomic_fetch_max(&_value, _sequence[i]); // free-function form takes the atomic by pointer
+    }
+
+private:
+    stdgpu::atomic<T> _value; // NOTE(review): copy presumably aliases the caller's storage (test reads the original) - confirm
+    T* _sequence; // input values, read at the invocation index i
+};
+
+template <typename T>
+class max_sequence_nonmember_explicit // device functor exercising stdgpu::atomic_fetch_max_explicit
+{
+public:
+    max_sequence_nonmember_explicit(const stdgpu::atomic<T>& value, T* sequence)
+      : _value(value) // stores a copy of the atomic object, not a reference
+      , _sequence(sequence)
+    {
+    }
+
+    STDGPU_DEVICE_ONLY void
+    operator()(const stdgpu::index_t i)
+    { // the only order exercised through this functor is memory_order_relaxed
+        stdgpu::atomic_fetch_max_explicit(&_value, _sequence[i], stdgpu::memory_order_relaxed);
+    }
+
+private:
+    stdgpu::atomic<T> _value; // NOTE(review): copy presumably aliases the caller's storage (test reads the original) - confirm
+    T* _sequence; // input values, read at the invocation index i
+};
+
+template <typename T, template <typename> class Function> // Function<T> is built from (atomic<T>, T*)
+void
+sequence_fetch_max()
+{
+    const stdgpu::index_t N = 40000; // number of indices passed to for_each_index
+    T* sequence = createDeviceArray<T>(N); // filled by the iota call below, starting at T(1)
+    stdgpu::iota(stdgpu::execution::device, stdgpu::device_begin(sequence), stdgpu::device_end(sequence), T(1));
+
+    stdgpu::atomic<T> value = stdgpu::atomic<T>::createDeviceObject();
+    value.store(std::numeric_limits<T>::lowest()); // start at the lowest T so the initial value cannot win the max
+
+    stdgpu::for_each_index(stdgpu::execution::device, N, Function<T>(value, sequence));
+
+    EXPECT_EQ(value.load(), T(N)); // presumably the last iota value (1..N), i.e. the sequence maximum
+
+    destroyDeviceArray<T>(sequence); // release both resources created above
+    stdgpu::atomic<T>::destroyDeviceObject(value);
+}
+
+TEST_F(stdgpu_atomic, fetch_max_int)
+{
+    sequence_fetch_max<int, max_sequence>(); // member fetch_max
+}
+
+TEST_F(stdgpu_atomic, fetch_max_unsigned_int)
+{
+    sequence_fetch_max<unsigned int, max_sequence>(); // member fetch_max
+}
+
+TEST_F(stdgpu_atomic, fetch_max_unsigned_long_long_int)
+{
+    sequence_fetch_max<unsigned long long int, max_sequence>(); // member fetch_max
+}
+
+TEST_F(stdgpu_atomic, fetch_max_nonmember_int)
+{
+    sequence_fetch_max<int, max_sequence_nonmember>(); // free function atomic_fetch_max
+}
+
+TEST_F(stdgpu_atomic, fetch_max_nonmember_unsigned_int)
+{
+    sequence_fetch_max<unsigned int, max_sequence_nonmember>(); // free function atomic_fetch_max
+}
+
+TEST_F(stdgpu_atomic, fetch_max_nonmember_unsigned_long_long_int)
+{
+    sequence_fetch_max<unsigned long long int, max_sequence_nonmember>(); // free function atomic_fetch_max
+}
+
+TEST_F(stdgpu_atomic, fetch_max_nonmember_explicit_int)
+{
+    sequence_fetch_max<int, max_sequence_nonmember_explicit>(); // atomic_fetch_max_explicit, relaxed order
+}
+
+TEST_F(stdgpu_atomic, fetch_max_nonmember_explicit_unsigned_int)
+{
+    sequence_fetch_max<unsigned int, max_sequence_nonmember_explicit>(); // atomic_fetch_max_explicit, relaxed order
+}
+
+TEST_F(stdgpu_atomic, fetch_max_nonmember_explicit_unsigned_long_long_int)
+{
+    sequence_fetch_max<unsigned long long int, max_sequence_nonmember_explicit>(); // explicit variant, relaxed order
+}
+
 template <typename T>
 bool
 bit_set(const T value, const stdgpu::index_t bit_position)
@@ -1706,116 +1960,6 @@ TEST_F(stdgpu_atomic, operator_xor_equals_unsigned_long_long_int)
     sequence_operator_xor_equals<unsigned long long int>();
 }
 
-template <typename T>
-class min_sequence
-{
-public:
-    min_sequence(const stdgpu::atomic<T>& value, T* sequence)
-      : _value(value)
-      , _sequence(sequence)
-    {
-    }
-
-    STDGPU_DEVICE_ONLY void
-    operator()(const stdgpu::index_t i)
-    {
-        _value.fetch_min(_sequence[i]);
-    }
-
-private:
-    stdgpu::atomic<T> _value;
-    T* _sequence;
-};
-
-template <typename T>
-void
-sequence_fetch_min()
-{
-    const stdgpu::index_t N = 40000;
-    T* sequence = createDeviceArray<T>(N);
-    stdgpu::iota(stdgpu::execution::device, stdgpu::device_begin(sequence), stdgpu::device_end(sequence), T(1));
-
-    stdgpu::atomic<T> value = stdgpu::atomic<T>::createDeviceObject();
-    value.store(std::numeric_limits<T>::max());
-
-    stdgpu::for_each_index(stdgpu::execution::device, N, min_sequence<T>(value, sequence));
-
-    EXPECT_EQ(value.load(), T(1));
-
-    destroyDeviceArray<T>(sequence);
-    stdgpu::atomic<T>::destroyDeviceObject(value);
-}
-
-TEST_F(stdgpu_atomic, fetch_min_int)
-{
-    sequence_fetch_min<int>();
-}
-
-TEST_F(stdgpu_atomic, fetch_min_unsigned_int)
-{
-    sequence_fetch_min<unsigned int>();
-}
-
-TEST_F(stdgpu_atomic, fetch_min_unsigned_long_long_int)
-{
-    sequence_fetch_min<unsigned long long int>();
-}
-
-template <typename T>
-class max_sequence
-{
-public:
-    max_sequence(const stdgpu::atomic<T>& value, T* sequence)
-      : _value(value)
-      , _sequence(sequence)
-    {
-    }
-
-    STDGPU_DEVICE_ONLY void
-    operator()(const stdgpu::index_t i)
-    {
-        _value.fetch_max(_sequence[i]);
-    }
-
-private:
-    stdgpu::atomic<T> _value;
-    T* _sequence;
-};
-
-template <typename T>
-void
-sequence_fetch_max()
-{
-    const stdgpu::index_t N = 40000;
-    T* sequence = createDeviceArray<T>(N);
-    stdgpu::iota(stdgpu::execution::device, stdgpu::device_begin(sequence), stdgpu::device_end(sequence), T(1));
-
-    stdgpu::atomic<T> value = stdgpu::atomic<T>::createDeviceObject();
-    value.store(std::numeric_limits<T>::lowest());
-
-    stdgpu::for_each_index(stdgpu::execution::device, N, max_sequence<T>(value, sequence));
-
-    EXPECT_EQ(value.load(), T(N));
-
-    destroyDeviceArray<T>(sequence);
-    stdgpu::atomic<T>::destroyDeviceObject(value);
-}
-
-TEST_F(stdgpu_atomic, fetch_max_int)
-{
-    sequence_fetch_max<int>();
-}
-
-TEST_F(stdgpu_atomic, fetch_max_unsigned_int)
-{
-    sequence_fetch_max<unsigned int>();
-}
-
-TEST_F(stdgpu_atomic, fetch_max_unsigned_long_long_int)
-{
-    sequence_fetch_max<unsigned long long int>();
-}
-
 template <typename T>
 class inc_mod_sequence
 {