diff --git a/sycl/include/sycl/access/access.hpp b/sycl/include/sycl/access/access.hpp
index ca8bda18eecb7..3e6820a521643 100644
--- a/sycl/include/sycl/access/access.hpp
+++ b/sycl/include/sycl/access/access.hpp
@@ -256,10 +256,44 @@ struct deduce_AS
 };
 #endif
 
+template <typename T> struct remove_decoration_impl {
+  using type = T;
+};
+
+#ifdef __SYCL_DEVICE_ONLY__
+template <typename T> struct remove_decoration_impl<__OPENCL_GLOBAL_AS__ T> {
+  using type = T;
+};
+
+#ifdef __ENABLE_USM_ADDR_SPACE__
+template <typename T>
+struct remove_decoration_impl<__OPENCL_GLOBAL_DEVICE_AS__ T> {
+  using type = T;
+};
+
+template <typename T>
+struct remove_decoration_impl<__OPENCL_GLOBAL_HOST_AS__ T> {
+  using type = T;
+};
+
+#endif // __ENABLE_USM_ADDR_SPACE__
+
+template <typename T> struct remove_decoration_impl<__OPENCL_PRIVATE_AS__ T> {
+  using type = T;
+};
+
+template <typename T> struct remove_decoration_impl<__OPENCL_LOCAL_AS__ T> {
+  using type = T;
+};
+
+template <typename T> struct remove_decoration_impl<__OPENCL_CONSTANT_AS__ T> {
+  using type = T;
+};
+#endif // __SYCL_DEVICE_ONLY__
 } // namespace detail
 
 template <typename T> struct remove_decoration {
-  using type = T;
+  using type = typename detail::remove_decoration_impl<T>::type;
 };
 
 // Propagate through const qualifier.
@@ -287,35 +321,6 @@ template <typename T> struct remove_decoration<const T &> {
   using type = const typename remove_decoration<T>::type &;
 };
 
-#ifdef __SYCL_DEVICE_ONLY__
-template <typename T> struct remove_decoration<__OPENCL_GLOBAL_AS__ T> {
-  using type = T;
-};
-
-#ifdef __ENABLE_USM_ADDR_SPACE__
-template <typename T> struct remove_decoration<__OPENCL_GLOBAL_DEVICE_AS__ T> {
-  using type = T;
-};
-
-template <typename T> struct remove_decoration<__OPENCL_GLOBAL_HOST_AS__ T> {
-  using type = T;
-};
-
-#endif // __ENABLE_USM_ADDR_SPACE__
-
-template <typename T> struct remove_decoration<__OPENCL_PRIVATE_AS__ T> {
-  using type = T;
-};
-
-template <typename T> struct remove_decoration<__OPENCL_LOCAL_AS__ T> {
-  using type = T;
-};
-
-template <typename T> struct remove_decoration<__OPENCL_CONSTANT_AS__ T> {
-  using type = T;
-};
-#endif // __SYCL_DEVICE_ONLY__
-
 template <typename T>
 using remove_decoration_t = typename remove_decoration<T>::type;
 
diff --git a/sycl/include/sycl/detail/builtins.hpp b/sycl/include/sycl/detail/builtins.hpp
index 69bde7c7b0d93..99fc593156d06 100644
--- a/sycl/include/sycl/detail/builtins.hpp
+++ b/sycl/include/sycl/detail/builtins.hpp
@@ -9,7 +9,7 @@
 #pragma once
 
 #include <sycl/detail/defines_elementary.hpp>  // for __SYCL_ALWAYS_INLINE
-#include <sycl/detail/generic_type_traits.hpp> // for convertDataToType
+#include <sycl/detail/generic_type_traits.hpp> // for to/from OpenCLType converts
 #include <utility>
 
 // TODO Decide whether to mark functions with this attribute.
@@ -18,20 +18,31 @@
 #ifdef __SYCL_DEVICE_ONLY__
 #define __FUNC_PREFIX_OCL __spirv_ocl_
 #define __FUNC_PREFIX_CORE __spirv_
-#define __SYCL_EXTERN_IT1(Ret, prefix, call, Arg1)
-#define __SYCL_EXTERN_IT2(Ret, prefix, call, Arg1, Arg2)
-#define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg)
-#define __SYCL_EXTERN_IT3(Ret, prefix, call, Arg1, Arg2, Arg3)
+#define __SYCL_EXTERN_IT1(R, prefix, call, arg1)
+#define __SYCL_EXTERN_IT2(R, prefix, call, arg1, arg2)
+#define __SYCL_EXTERN_IT2_SAME(R, prefix, call, arg)
+#define __SYCL_EXTERN_IT3(R, prefix, call, Arg1, Arg2, Arg3)
 #else
 #define __FUNC_PREFIX_OCL sycl_host_
 #define __FUNC_PREFIX_CORE sycl_host_
-#define __SYCL_EXTERN_IT1(Ret, prefix, call, Arg)                              \
+#define __SYCL_EXTERN_IT1(R, prefix, call, arg)                                \
+  using Arg = decltype(arg);                                                   \
+  using Ret = sycl::detail::ConvertToOpenCLType_t<R>;                          \
   extern Ret __SYCL_PPCAT(prefix, call)(Arg)
-#define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg)                         \
+#define __SYCL_EXTERN_IT2_SAME(R, prefix, call, arg)                           \
+  using Arg = decltype(arg);                                                   \
+  using Ret = sycl::detail::ConvertToOpenCLType_t<R>;                          \
   extern Ret __SYCL_PPCAT(prefix, call)(Arg, Arg)
-#define __SYCL_EXTERN_IT2(Ret, prefix, call, Arg1, Arg2)                       \
+#define __SYCL_EXTERN_IT2(R, prefix, call, arg1, arg2)                         \
+  using Arg1 = decltype(arg1);                                                 \
+  using Arg2 = decltype(arg2);                                                 \
+  using Ret = sycl::detail::ConvertToOpenCLType_t<R>;                          \
   extern Ret __SYCL_PPCAT(prefix, call)(Arg1, Arg2)
-#define __SYCL_EXTERN_IT3(Ret, prefix, call, Arg1, Arg2, Arg3)                 \
+#define __SYCL_EXTERN_IT3(R, prefix, call, arg1, arg2, arg3)                   \
+  using Arg1 = decltype(arg1);                                                 \
+  using Arg2 = decltype(arg2);                                                 \
+  using Arg3 = decltype(arg3);                                                 \
+  using Ret = sycl::detail::ConvertToOpenCLType_t<R>;                          \
   extern Ret __SYCL_PPCAT(prefix, call)(Arg1, Arg2, Arg3)
 #endif
 
@@ -41,64 +52,51 @@
 #define __SYCL_MAKE_CALL_ARG1(call, prefix)                                    \
   template <typename R, typename T1>                                           \
   inline __SYCL_ALWAYS_INLINE R __invoke_##call(T1 t1) __NOEXC {               \
-    using Ret = sycl::detail::ConvertToOpenCLType_t<R>;                        \
-    using Arg1 = sycl::detail::ConvertToOpenCLType_t<T1>;                      \
-    __SYCL_EXTERN_IT1(Ret, prefix, call, Arg1);                                \
-    Arg1 arg1 = sycl::detail::convertDataToType<T1, Arg1>(std::move(t1));      \
-    Ret ret = __SYCL_PPCAT(prefix, call)(std::move(arg1));                     \
-    return sycl::detail::convertDataToType<Ret, R>(std::move(ret));            \
+    auto arg1 = sycl::detail::convertToOpenCLType(std::move(t1));              \
+    __SYCL_EXTERN_IT1(R, prefix, call, arg1);                                  \
+    return sycl::detail::convertFromOpenCLTypeFor<R>(                          \
+        __SYCL_PPCAT(prefix, call)(std::move(arg1)));                          \
   }
 
 #define __SYCL_MAKE_CALL_ARG2(call, prefix)                                    \
   template <typename R, typename T1, typename T2>                              \
   inline __SYCL_ALWAYS_INLINE R __invoke_##call(T1 t1, T2 t2) __NOEXC {        \
-    using Ret = sycl::detail::ConvertToOpenCLType_t<R>;                        \
-    using Arg1 = sycl::detail::ConvertToOpenCLType_t<T1>;                      \
-    using Arg2 = sycl::detail::ConvertToOpenCLType_t<T2>;                      \
-    __SYCL_EXTERN_IT2(Ret, prefix, call, Arg1, Arg2);                          \
-    Arg1 arg1 = sycl::detail::convertDataToType<T1, Arg1>(std::move(t1));      \
-    Arg2 arg2 = sycl::detail::convertDataToType<T2, Arg2>(std::move(t2));      \
-    Ret ret = __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2));    \
-    return sycl::detail::convertDataToType<Ret, R>(std::move(ret));            \
+    auto arg1 = sycl::detail::convertToOpenCLType(std::move(t1));              \
+    auto arg2 = sycl::detail::convertToOpenCLType(std::move(t2));              \
+    __SYCL_EXTERN_IT2(R, prefix, call, arg1, arg2);                            \
+    return sycl::detail::convertFromOpenCLTypeFor<R>(                          \
+        __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2)));         \
   }
 
 #define __SYCL_MAKE_CALL_ARG2_SAME(call, prefix)                               \
   template <typename R, typename T>                                            \
   inline __SYCL_ALWAYS_INLINE R __invoke_##call(T t1, T t2) __NOEXC {          \
-    using Ret = sycl::detail::ConvertToOpenCLType_t<R>;                        \
-    using Arg = sycl::detail::ConvertToOpenCLType_t<T>;                        \
-    __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg);                            \
-    Arg arg1 = sycl::detail::convertDataToType<T, Arg>(std::move(t1));         \
-    Arg arg2 = sycl::detail::convertDataToType<T, Arg>(std::move(t2));         \
-    Ret ret = __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2));    \
-    return sycl::detail::convertDataToType<Ret, R>(std::move(ret));            \
+    auto arg1 = sycl::detail::convertToOpenCLType(std::move(t1));              \
+    auto arg2 = sycl::detail::convertToOpenCLType(std::move(t2));              \
+    __SYCL_EXTERN_IT2_SAME(R, prefix, call, arg1);                             \
+    return sycl::detail::convertFromOpenCLTypeFor<R>(                          \
+        __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2)));         \
   }
 
 #define __SYCL_MAKE_CALL_ARG2_SAME_RESULT(call, prefix)                        \
   template <typename T>                                                        \
   inline __SYCL_ALWAYS_INLINE T __invoke_##call(T v1, T v2) __NOEXC {          \
-    using Type = sycl::detail::ConvertToOpenCLType_t<T>;                       \
-    __SYCL_EXTERN_IT2_SAME(Type, prefix, call, Type);                          \
-    Type arg1 = sycl::detail::convertDataToType<T, Type>(std::move(v1));       \
-    Type arg2 = sycl::detail::convertDataToType<T, Type>(std::move(v2));       \
-    Type ret = __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2));   \
-    return sycl::detail::convertDataToType<Type, T>(std::move(ret));           \
+    auto arg1 = sycl::detail::convertToOpenCLType(std::move(v1));              \
+    auto arg2 = sycl::detail::convertToOpenCLType(std::move(v2));              \
+    __SYCL_EXTERN_IT2_SAME(T, prefix, call, arg1);                             \
+    return sycl::detail::convertFromOpenCLTypeFor<T>(                          \
+        __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2)));         \
   }
 
 #define __SYCL_MAKE_CALL_ARG3(call, prefix)                                    \
   template <typename R, typename T1, typename T2, typename T3>                 \
   inline __SYCL_ALWAYS_INLINE R __invoke_##call(T1 t1, T2 t2, T3 t3) __NOEXC { \
-    using Ret = sycl::detail::ConvertToOpenCLType_t<R>;                        \
-    using Arg1 = sycl::detail::ConvertToOpenCLType_t<T1>;                      \
-    using Arg2 = sycl::detail::ConvertToOpenCLType_t<T2>;                      \
-    using Arg3 = sycl::detail::ConvertToOpenCLType_t<T3>;                      \
-    __SYCL_EXTERN_IT3(Ret, prefix, call, Arg1, Arg2, Arg3);                    \
-    Arg1 arg1 = sycl::detail::convertDataToType<T1, Arg1>(std::move(t1));      \
-    Arg2 arg2 = sycl::detail::convertDataToType<T2, Arg2>(std::move(t2));      \
-    Arg3 arg3 = sycl::detail::convertDataToType<T3, Arg3>(std::move(t3));      \
-    Ret ret = __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2),     \
-                                         std::move(arg3));                     \
-    return sycl::detail::convertDataToType<Ret, R>(std::move(ret));            \
+    auto arg1 = sycl::detail::convertToOpenCLType(std::move(t1));              \
+    auto arg2 = sycl::detail::convertToOpenCLType(std::move(t2));              \
+    auto arg3 = sycl::detail::convertToOpenCLType(std::move(t3));              \
+    __SYCL_EXTERN_IT3(R, prefix, call, arg1, arg2, arg3);                      \
+    return sycl::detail::convertFromOpenCLTypeFor<R>(__SYCL_PPCAT(             \
+        prefix, call)(std::move(arg1), std::move(arg2), std::move(arg3)));     \
   }
 
 #ifndef __SYCL_DEVICE_ONLY__
diff --git a/sycl/include/sycl/detail/generic_type_traits.hpp b/sycl/include/sycl/detail/generic_type_traits.hpp
index 36431fcdad13a..7cf893778394c 100644
--- a/sycl/include/sycl/detail/generic_type_traits.hpp
+++ b/sycl/include/sycl/detail/generic_type_traits.hpp
@@ -697,6 +697,25 @@ convertDataToType(FROM t) {
     return ConvertNonVectorType<TO>(t);
 }
 
+// Now fuse the above into a simpler helper that's easy to use.
+// TODO: That should probably be moved outside of "type_traits".
+template <typename T> auto convertToOpenCLType(T &&x) {
+  using OpenCLType = ConvertToOpenCLType_t<std::remove_reference_t<T>>;
+  return convertDataToType<T, OpenCLType>(std::forward<T>(x));
+}
+
+template <typename To, typename From> auto convertFromOpenCLTypeFor(From &&x) {
+  if constexpr (std::is_same_v<To, bool> &&
+                std::is_same_v<std::remove_reference_t<From>, bool>) {
+    // FIXME: Something seems to be wrong elsewhere...
+    return x;
+  } else {
+    static_assert(std::is_same_v<std::remove_reference_t<From>,
+                                 ConvertToOpenCLType_t<To>>);
+    return convertDataToType<From, To>(std::forward<From>(x));
+  }
+}
+
 // Used for all, any and select relational built-in functions
 template <typename T> inline constexpr T msbMask(T) {
   using UT = make_unsigned_t<T>;
diff --git a/sycl/include/sycl/detail/image_ocl_types.hpp b/sycl/include/sycl/detail/image_ocl_types.hpp
index adcedc6722297..1f3da07f83ea8 100644
--- a/sycl/include/sycl/detail/image_ocl_types.hpp
+++ b/sycl/include/sycl/detail/image_ocl_types.hpp
@@ -47,9 +47,8 @@ template <typename ImageType> struct sampled_opencl_image_type;
 #define __SYCL_INVOKE_SPIRV_CALL_ARG1(call)                                    \
   template <typename R, typename T1> inline R __invoke_##call(T1 ParT1) {      \
     using Ret = sycl::detail::ConvertToOpenCLType_t<R>;                        \
-    T1 Arg1 = ParT1;                                                           \
-    Ret RetVar = __spirv_##call<Ret, T1>(Arg1);                                \
-    return sycl::detail::convertDataToType<Ret, R>(RetVar);                    \
+    return sycl::detail::convertFromOpenCLTypeFor<R>(                          \
+        __spirv_##call<Ret, T1>(ParT1));                                       \
   }
 
 // The macro defines the function __invoke_ImageXXXX,
@@ -62,13 +61,11 @@ template <typename ImageT, typename CoordT, typename ValT>
 static void __invoke__ImageWrite(ImageT Img, CoordT Coords, ValT Val) {
 
   // Convert from sycl types to builtin types to get correct function mangling.
-  using TmpValT = sycl::detail::ConvertToOpenCLType_t<ValT>;
-  using TmpCoordT = sycl::detail::ConvertToOpenCLType_t<CoordT>;
+  auto TmpCoords = sycl::detail::convertToOpenCLType(Coords);
+  auto TmpVal = sycl::detail::convertToOpenCLType(Val);
 
-  TmpCoordT TmpCoord =
-      sycl::detail::convertDataToType<CoordT, TmpCoordT>(Coords);
-  TmpValT TmpVal = sycl::detail::convertDataToType<ValT, TmpValT>(Val);
-  __spirv_ImageWrite<ImageT, TmpCoordT, TmpValT>(Img, TmpCoord, TmpVal);
+  __spirv_ImageWrite<ImageT, decltype(TmpCoords), decltype(TmpVal)>(
+      Img, TmpCoords, TmpVal);
 }
 
 template <typename RetType, typename ImageT, typename CoordT>
@@ -76,11 +73,10 @@ static RetType __invoke__ImageRead(ImageT Img, CoordT Coords) {
 
   // Convert from sycl types to builtin types to get correct function mangling.
   using TempRetT = sycl::detail::ConvertToOpenCLType_t<RetType>;
-  using TempArgT = sycl::detail::ConvertToOpenCLType_t<CoordT>;
+  auto TmpCoords = sycl::detail::convertToOpenCLType(Coords);
 
-  TempArgT Arg = sycl::detail::convertDataToType<CoordT, TempArgT>(Coords);
-  TempRetT Ret = __spirv_ImageRead<TempRetT, ImageT, TempArgT>(Img, Arg);
-  return sycl::detail::convertDataToType<TempRetT, RetType>(Ret);
+  return sycl::detail::convertFromOpenCLTypeFor<RetType>(
+      __spirv_ImageRead<TempRetT, ImageT, decltype(TmpCoords)>(Img, TmpCoords));
 }
 
 template <typename RetType, typename ImageT, typename CoordT>
@@ -101,10 +97,7 @@ static RetType __invoke__ImageReadLod(SmpImageT SmpImg, CoordT Coords,
 
   // Convert from sycl types to builtin types to get correct function mangling.
   using TempRetT = sycl::detail::ConvertToOpenCLType_t<RetType>;
-  using TempArgT = sycl::detail::ConvertToOpenCLType_t<CoordT>;
-
-  TempArgT TmpCoords =
-      sycl::detail::convertDataToType<CoordT, TempArgT>(Coords);
+  auto TmpCoords = sycl::detail::convertToOpenCLType(Coords);
 
   enum ImageOperands { Lod = 0x2 };
 
@@ -114,9 +107,9 @@ static RetType __invoke__ImageReadLod(SmpImageT SmpImg, CoordT Coords,
   // Sampled Image must be an object whose type is OpTypeSampledImage
   // Image Operands encodes what operands follow. Either Lod
   // or Grad image operands must be present
-  TempRetT Ret = __spirv_ImageSampleExplicitLod<SmpImageT, TempRetT, TempArgT>(
-      SmpImg, TmpCoords, ImageOperands::Lod, Level);
-  return sycl::detail::convertDataToType<TempRetT, RetType>(Ret);
+  return sycl::detail::convertFromOpenCLTypeFor<RetType>(
+      __spirv_ImageSampleExplicitLod<SmpImageT, TempRetT, decltype(TmpCoords)>(
+          SmpImg, TmpCoords, ImageOperands::Lod, Level));
 }
 
 template <typename RetType, typename SmpImageT, typename CoordT>
@@ -125,12 +118,9 @@ static RetType __invoke__ImageReadGrad(SmpImageT SmpImg, CoordT Coords,
 
   // Convert from sycl types to builtin types to get correct function mangling.
   using TempRetT = sycl::detail::ConvertToOpenCLType_t<RetType>;
-  using TempArgT = sycl::detail::ConvertToOpenCLType_t<CoordT>;
-
-  TempArgT TmpCoords =
-      sycl::detail::convertDataToType<CoordT, TempArgT>(Coords);
-  TempArgT TmpGraddX = sycl::detail::convertDataToType<CoordT, TempArgT>(Dx);
-  TempArgT TmpGraddY = sycl::detail::convertDataToType<CoordT, TempArgT>(Dy);
+  auto TmpCoords = sycl::detail::convertToOpenCLType(Coords);
+  auto TmpGraddX = sycl::detail::convertToOpenCLType(Dx);
+  auto TmpGraddY = sycl::detail::convertToOpenCLType(Dy);
 
   enum ImageOperands { Grad = 0x3 };
 
@@ -140,9 +130,9 @@ static RetType __invoke__ImageReadGrad(SmpImageT SmpImg, CoordT Coords,
   // Sampled Image must be an object whose type is OpTypeSampledImage
   // Image Operands encodes what operands follow. Either Lod
   // or Grad image operands must be present
-  TempRetT Ret = __spirv_ImageSampleExplicitLod<SmpImageT, TempRetT, TempArgT>(
-      SmpImg, TmpCoords, ImageOperands::Grad, TmpGraddX, TmpGraddY);
-  return sycl::detail::convertDataToType<TempRetT, RetType>(Ret);
+  return sycl::detail::convertFromOpenCLTypeFor<RetType>(
+      __spirv_ImageSampleExplicitLod<SmpImageT, TempRetT, decltype(TmpCoords)>(
+          SmpImg, TmpCoords, ImageOperands::Grad, TmpGraddX, TmpGraddY));
 }
 
 template <typename RetType, typename ImageT, typename CoordT>
@@ -151,12 +141,10 @@ static RetType __invoke__ImageReadSampler(ImageT Img, CoordT Coords,
 
   // Convert from sycl types to builtin types to get correct function mangling.
   using TempRetT = sycl::detail::ConvertToOpenCLType_t<RetType>;
-  using TempArgT = sycl::detail::ConvertToOpenCLType_t<CoordT>;
   using SampledT =
       typename sycl::detail::sampled_opencl_image_type<ImageT>::type;
 
-  TempArgT TmpCoords =
-      sycl::detail::convertDataToType<CoordT, TempArgT>(Coords);
+  auto TmpCoords = sycl::detail::convertToOpenCLType(Coords);
   // According to validation rules(SPIR-V specification, section 2.16.1) result
   // of __spirv_SampledImage is allowed to be an operand of image lookup
   // and query instructions explicitly specified to take an operand whose
@@ -169,10 +157,10 @@ static RetType __invoke__ImageReadSampler(ImageT Img, CoordT Coords,
   enum ImageOperands { Lod = 0x2 };
 
   // Lod value is zero as mipmap is not supported.
-  TempRetT Ret = __spirv_ImageSampleExplicitLod<SampledT, TempRetT, TempArgT>(
-      __spirv_SampledImage<ImageT, SampledT>(Img, Smpl), TmpCoords,
-      ImageOperands::Lod, 0.0f);
-  return sycl::detail::convertDataToType<TempRetT, RetType>(Ret);
+  return sycl::detail::convertFromOpenCLTypeFor<RetType>(
+      __spirv_ImageSampleExplicitLod<SampledT, TempRetT, decltype(TmpCoords)>(
+          __spirv_SampledImage<ImageT, SampledT>(Img, Smpl), TmpCoords,
+          ImageOperands::Lod, 0.0f));
 }
 
 namespace sycl {
diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index ee80a50db417b..54bb7d229c372 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -256,14 +256,11 @@ template <> struct GroupId<::sycl::sub_group> {
 };
 template <typename Group, typename T, typename IdT>
 EnableIfNativeBroadcast<T, IdT> GroupBroadcast(Group, T x, IdT local_id) {
-  using GroupIdT = typename GroupId<Group>::type;
-  GroupIdT GroupLocalId = static_cast<GroupIdT>(local_id);
-  using OCLT = detail::ConvertToOpenCLType_t<T>;
-  using WidenedT = WidenOpenCLTypeTo32_t<OCLT>;
-  using OCLIdT = detail::ConvertToOpenCLType_t<GroupIdT>;
-  WidenedT OCLX = detail::convertDataToType<T, OCLT>(x);
-  OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);
-  return __spirv_GroupBroadcast(group_scope<Group>::value, OCLX, OCLId);
+  auto GroupLocalId = static_cast<typename GroupId<Group>::type>(local_id);
+  auto OCLX = detail::convertToOpenCLType(x);
+  WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
+  auto OCLId = detail::convertToOpenCLType(GroupLocalId);
+  return __spirv_GroupBroadcast(group_scope<Group>::value, WideOCLX, OCLId);
 }
 template <typename ParentGroup, typename T, typename IdT>
 EnableIfNativeBroadcast<T, IdT>
@@ -273,23 +270,20 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
   auto LocalId = detail::IdToMaskPosition(g, local_id);
 
   // TODO: Refactor to avoid duplication after design settles.
-  using GroupIdT = typename GroupId<ParentGroup>::type;
-  GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
-  using OCLT = detail::ConvertToOpenCLType_t<T>;
-  using WidenedT = WidenOpenCLTypeTo32_t<OCLT>;
-  using OCLIdT = detail::ConvertToOpenCLType_t<GroupIdT>;
-  WidenedT OCLX = detail::convertDataToType<T, OCLT>(x);
-  OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);
+  auto GroupLocalId = static_cast<typename GroupId<ParentGroup>::type>(LocalId);
+  auto OCLX = detail::convertToOpenCLType(x);
+  WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
+  auto OCLId = detail::convertToOpenCLType(GroupLocalId);
 
   // ballot_group partitions its parent into two groups (0 and 1)
   // We have to force each group down different control flow
   // Work-items in the "false" group (0) may still be active
   if (g.get_group_id() == 1) {
     return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
-                                            OCLX, OCLId);
+                                            WideOCLX, OCLId);
   } else {
     return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
-                                            OCLX, OCLId);
+                                            WideOCLX, OCLId);
   }
 }
 template <size_t PartitionSize, typename ParentGroup, typename T, typename IdT>
@@ -300,20 +294,17 @@ EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
   auto LocalId = g.get_group_linear_id() * PartitionSize + local_id;
 
   // TODO: Refactor to avoid duplication after design settles.
-  using GroupIdT = typename GroupId<ParentGroup>::type;
-  GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
-  using OCLT = detail::ConvertToOpenCLType_t<T>;
-  using WidenedT = WidenOpenCLTypeTo32_t<OCLT>;
-  using OCLIdT = detail::ConvertToOpenCLType_t<GroupIdT>;
-  WidenedT OCLX = detail::convertDataToType<T, OCLT>(x);
-  OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);
+  auto GroupLocalId = static_cast<typename GroupId<ParentGroup>::type>(LocalId);
+  auto OCLX = detail::convertToOpenCLType(x);
+  WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
+  auto OCLId = detail::convertToOpenCLType(GroupLocalId);
 
   // NonUniformBroadcast requires Id to be dynamically uniform, which does not
   // hold here; each partition is broadcasting a separate index. We could
   // fallback to either NonUniformShuffle or a NonUniformBroadcast per
   // partition, and it's unclear which will be faster in practice.
-  return __spirv_GroupNonUniformShuffle(group_scope<ParentGroup>::value, OCLX,
-                                        OCLId);
+  return __spirv_GroupNonUniformShuffle(group_scope<ParentGroup>::value,
+                                        WideOCLX, OCLId);
 }
 template <typename ParentGroup, typename T, typename IdT>
 EnableIfNativeBroadcast<T, IdT>
@@ -323,16 +314,13 @@ GroupBroadcast(ext::oneapi::experimental::tangle_group<ParentGroup> g, T x,
   auto LocalId = detail::IdToMaskPosition(g, local_id);
 
   // TODO: Refactor to avoid duplication after design settles.
-  using GroupIdT = typename GroupId<ParentGroup>::type;
-  GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
-  using OCLT = detail::ConvertToOpenCLType_t<T>;
-  using WidenedT = WidenOpenCLTypeTo32_t<OCLT>;
-  using OCLIdT = detail::ConvertToOpenCLType_t<GroupIdT>;
-  WidenedT OCLX = detail::convertDataToType<T, OCLT>(x);
-  OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);
+  auto GroupLocalId = static_cast<typename GroupId<ParentGroup>::type>(LocalId);
+  auto OCLX = detail::convertToOpenCLType(x);
+  WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
+  auto OCLId = detail::convertToOpenCLType(GroupLocalId);
 
-  return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value, OCLX,
-                                          OCLId);
+  return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
+                                          WideOCLX, OCLId);
 }
 template <typename T, typename IdT>
 EnableIfNativeBroadcast<T, IdT>
@@ -342,17 +330,15 @@ GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x,
   auto LocalId = detail::IdToMaskPosition(g, local_id);
 
   // TODO: Refactor to avoid duplication after design settles.
-  using GroupIdT = typename GroupId<::sycl::sub_group>::type;
-  GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
-  using OCLT = detail::ConvertToOpenCLType_t<T>;
-  using WidenedT = WidenOpenCLTypeTo32_t<OCLT>;
-  using OCLIdT = detail::ConvertToOpenCLType_t<GroupIdT>;
-  WidenedT OCLX = detail::convertDataToType<T, OCLT>(x);
-  OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);
+  auto GroupLocalId =
+      static_cast<typename GroupId<::sycl::sub_group>::type>(LocalId);
+  auto OCLX = detail::convertToOpenCLType(x);
+  WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
+  auto OCLId = detail::convertToOpenCLType(GroupLocalId);
 
   return __spirv_GroupNonUniformBroadcast(
-      group_scope<ext::oneapi::experimental::opportunistic_group>::value, OCLX,
-      OCLId);
+      group_scope<ext::oneapi::experimental::opportunistic_group>::value,
+      WideOCLX, OCLId);
 }
 
 template <typename Group, typename T, typename IdT>
@@ -386,16 +372,14 @@ EnableIfNativeBroadcast<T> GroupBroadcast(Group g, T x,
     return GroupBroadcast(g, x, local_id[0]);
   }
   using IdT = vec<size_t, Dimensions>;
-  using OCLT = detail::ConvertToOpenCLType_t<T>;
-  using WidenedT = WidenOpenCLTypeTo32_t<OCLT>;
-  using OCLIdT = detail::ConvertToOpenCLType_t<IdT>;
   IdT VecId;
   for (int i = 0; i < Dimensions; ++i) {
     VecId[i] = local_id[Dimensions - i - 1];
   }
-  WidenedT OCLX = detail::convertDataToType<T, OCLT>(x);
-  OCLIdT OCLId = detail::convertDataToType<IdT, OCLIdT>(VecId);
-  return __spirv_GroupBroadcast(group_scope<Group>::value, OCLX, OCLId);
+  auto OCLX = detail::convertToOpenCLType(x);
+  WidenOpenCLTypeTo32_t<decltype(OCLX)> WideOCLX = OCLX;
+  auto OCLId = detail::convertToOpenCLType(VecId);
+  return __spirv_GroupBroadcast(group_scope<Group>::value, WideOCLX, OCLId);
 }
 template <typename ParentGroup, typename T>
 EnableIfNativeBroadcast<T>
@@ -819,8 +803,7 @@ EnableIfGenericShuffle<T> SubgroupShuffleUp(T x, uint32_t delta);
 template <typename T>
 EnableIfNativeShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
 #ifndef __NVPTX__
-  using OCLT = detail::ConvertToOpenCLType_t<T>;
-  return __spirv_SubgroupShuffleINTEL(OCLT(x),
+  return __spirv_SubgroupShuffleINTEL(convertToOpenCLType(x),
                                       static_cast<uint32_t>(local_id.get(0)));
 #else
   return __nvvm_shfl_sync_idx_i32(membermask(), x, local_id.get(0), 0x1f);
@@ -830,9 +813,8 @@ EnableIfNativeShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
 template <typename T>
 EnableIfNativeShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
 #ifndef __NVPTX__
-  using OCLT = detail::ConvertToOpenCLType_t<T>;
   return __spirv_SubgroupShuffleXorINTEL(
-      OCLT(x), static_cast<uint32_t>(local_id.get(0)));
+      convertToOpenCLType(x), static_cast<uint32_t>(local_id.get(0)));
 #else
   return __nvvm_shfl_sync_bfly_i32(membermask(), x, local_id.get(0), 0x1f);
 #endif
@@ -841,8 +823,8 @@ EnableIfNativeShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
 template <typename T>
 EnableIfNativeShuffle<T> SubgroupShuffleDown(T x, uint32_t delta) {
 #ifndef __NVPTX__
-  using OCLT = detail::ConvertToOpenCLType_t<T>;
-  return __spirv_SubgroupShuffleDownINTEL(OCLT(x), OCLT(x), delta);
+  return __spirv_SubgroupShuffleDownINTEL(convertToOpenCLType(x),
+                                          convertToOpenCLType(x), delta);
 #else
   return __nvvm_shfl_sync_down_i32(membermask(), x, delta, 0x1f);
 #endif
@@ -851,8 +833,8 @@ EnableIfNativeShuffle<T> SubgroupShuffleDown(T x, uint32_t delta) {
 template <typename T>
 EnableIfNativeShuffle<T> SubgroupShuffleUp(T x, uint32_t delta) {
 #ifndef __NVPTX__
-  using OCLT = detail::ConvertToOpenCLType_t<T>;
-  return __spirv_SubgroupShuffleUpINTEL(OCLT(x), OCLT(x), delta);
+  return __spirv_SubgroupShuffleUpINTEL(convertToOpenCLType(x),
+                                        convertToOpenCLType(x), delta);
 #else
   return __nvvm_shfl_sync_up_i32(membermask(), x, delta, 0);
 #endif
diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
index 14b36a29ccd73..26ff67e54a2a8 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
@@ -95,9 +95,8 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t<
     sycl::detail::is_svgenfloatf_v<T> || sycl::detail::is_svgenfloath_v<T>, T>
 tanh(T x) __NOEXC {
 #if defined(__NVPTX__)
-  using _ocl_T = sycl::detail::ConvertToOpenCLType_t<T>;
-  _ocl_T arg1 = sycl::detail::convertDataToType<T, _ocl_T>(x);
-  return sycl::detail::convertDataToType<_ocl_T, T>(__clc_native_tanh(arg1));
+  return sycl::detail::convertFromOpenCLTypeFor<T>(
+      __clc_native_tanh(sycl::detail::convertToOpenCLType(x)));
 #else
   return __sycl_std::__invoke_tanh<T>(x);
 #endif
@@ -145,9 +144,8 @@ inline __SYCL_ALWAYS_INLINE
     std::enable_if_t<sycl::detail::is_svgenfloath_v<T>, T>
     exp2(T x) __NOEXC {
 #if defined(__NVPTX__)
-  using _ocl_T = sycl::detail::ConvertToOpenCLType_t<T>;
-  _ocl_T arg1 = sycl::detail::convertDataToType<T, _ocl_T>(x);
-  return sycl::detail::convertDataToType<_ocl_T, T>(__clc_native_exp2(arg1));
+  return sycl::detail::convertFromOpenCLTypeFor<T>(
+      __clc_native_exp2(sycl::detail::convertToOpenCLType(x)));
 #else
   return __sycl_std::__invoke_exp2<T>(x);
 #endif
diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
index a2f2bfb321842..af68ce0e10e0f 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
@@ -34,11 +34,9 @@ inline sycl::vec<unsigned, 4> ExtractMask(ext::oneapi::sub_group_mask Mask) {
 // TODO: This may need to be generalized beyond uint32_t for big masks
 inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) {
   sycl::vec<unsigned, 4> MemberMask = ExtractMask(Mask);
-  auto OCLMask =
-      sycl::detail::ConvertToOpenCLType_t<sycl::vec<unsigned, 4>>(MemberMask);
   return __spirv_GroupNonUniformBallotBitCount(
       __spv::Scope::Subgroup, (int)__spv::GroupOperation::ExclusiveScan,
-      OCLMask);
+      sycl::detail::convertToOpenCLType(MemberMask));
 }
 #endif
 
diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
index bf28a1792ce86..a72c517fe27b9 100644
--- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
+++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
@@ -138,10 +138,9 @@ struct sub_group_mask {
     for (int i = 0; i < 4; ++i) {
       MemberMask[i] = TmpMArray[i];
     }
-    auto OCLMask =
-        sycl::detail::ConvertToOpenCLType_t<sycl::vec<unsigned, 4>>(MemberMask);
     return __spirv_GroupNonUniformBallotBitCount(
-        __spv::Scope::Subgroup, (int)__spv::GroupOperation::Reduce, OCLMask);
+        __spv::Scope::Subgroup, (int)__spv::GroupOperation::Reduce,
+        sycl::detail::convertToOpenCLType(MemberMask));
 #else
     unsigned int count = 0;
     auto word = (Bits & valuable_bits(bits_num));
diff --git a/sycl/include/sycl/group.hpp b/sycl/include/sycl/group.hpp
index 39b1f0a0293c8..ceb0c58dcf99c 100644
--- a/sycl/include/sycl/group.hpp
+++ b/sycl/include/sycl/group.hpp
@@ -14,7 +14,7 @@
 #include <sycl/detail/common.hpp>              // for NDLoop, __SYCL_ASSERT
 #include <sycl/detail/defines.hpp>             // for __SYCL_TYPE
 #include <sycl/detail/defines_elementary.hpp>  // for __SYCL2020_DEPRECATED
-#include <sycl/detail/generic_type_traits.hpp> // for ConvertToOpenCLType_t
+#include <sycl/detail/generic_type_traits.hpp> // for convertToOpenCLType
 #include <sycl/detail/helpers.hpp>             // for Builder, getSPIRVMemo...
 #include <sycl/detail/item_base.hpp>           // for id, range
 #include <sycl/detail/type_traits.hpp>         // for is_bool, change_base_...
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 5a26623ce4003..e9ed3f2bb6e4b 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -966,8 +966,8 @@ Scheduler::GraphBuildResult Scheduler::GraphBuilder::addCG(
           ++Ev;
           continue;
         }
-        // Handle event dependencies on any commands part of another active
-        // fusion.
+        // Event dependencies on commands part of another active fusion are
+        // handled by cancelling fusion in that other queue.
         if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) {
           printFusionWarning(
               "Aborting fusion because of event dependency from a "
diff --git a/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp b/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp
index 8206ea5b449c3..8399951d16298 100644
--- a/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp
+++ b/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp
@@ -24,7 +24,7 @@
 // the tests, please check if they pass without the discard_events property, if
 // they don't pass then it's most likely a general issue unrelated to
 // discard_events.
-
+// REQUIRES: aspect-usm_shared_allocations
 #include <CL/sycl.hpp>
 #include <cassert>
 #include <iostream>
diff --git a/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp b/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp
index 11288d6620bfd..48ab65c68896c 100644
--- a/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp
+++ b/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp
@@ -1,7 +1,7 @@
 // RUN: %{build} -o %t.out
 
 // RUN: env SYCL_PI_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt
-
+// REQUIRES: aspect-usm_shared_allocations
 // The test checks that the last parameter is `nullptr` for all PI calls that
 // should discard events.
 // {{0|0000000000000000}} is required for various output on Linux and Windows.
diff --git a/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp b/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp
index cfe72db0c1232..96d53a632beb6 100644
--- a/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp
+++ b/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp
@@ -1,7 +1,7 @@
 // RUN: %{build} -o %t.out
 
 // RUN: env SYCL_PI_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt
-
+// REQUIRES: aspect-usm_shared_allocations
 // The test checks that the last parameter is not `nullptr` for all PI calls
 // that should discard events.
 // {{0|0000000000000000}} is required for various output on Linux and Windows.
diff --git a/sycl/test-e2e/DiscardEvents/invalid_event.cpp b/sycl/test-e2e/DiscardEvents/invalid_event.cpp
index 273e74afb6c25..17c6a492a10c2 100644
--- a/sycl/test-e2e/DiscardEvents/invalid_event.cpp
+++ b/sycl/test-e2e/DiscardEvents/invalid_event.cpp
@@ -2,7 +2,6 @@
 // https://github.com/intel/llvm/issues/7330.
 // UNSUPPORTED: opencl && gpu
 // RUN: %{build} -o %t.out
-
 // RUN: %{run} %t.out
 
 // The test checks that each PI call to the queue returns a discarded event
@@ -19,9 +18,9 @@ void QueueAPIsReturnDiscardedEvent(sycl::queue Q) {
   sycl::range<1> range(BUFFER_SIZE);
 
   auto Dev = Q.get_device();
-  int *x = sycl::malloc_shared<int>(BUFFER_SIZE, Q);
+  int *x = sycl::malloc_device<int>(BUFFER_SIZE, Q);
   assert(x != nullptr);
-  int *y = sycl::malloc_shared<int>(BUFFER_SIZE, Q);
+  int *y = sycl::malloc_device<int>(BUFFER_SIZE, Q);
   assert(y != nullptr);
 
   sycl::event DiscardedEvent;
diff --git a/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp b/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp
index 6411131ef33ff..8f7ee3a55352c 100644
--- a/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp
@@ -9,11 +9,14 @@
 #include <iostream>
 #include <sycl/sycl.hpp>
 
-template <typename T>
-sycl::event compiler_group_scan_impl(sycl::queue *queue, T *in_data,
-                                     T *out_data, int num_wg, int group_size) {
+template <typename T, typename AccessorT>
+sycl::event compiler_group_scan_impl(sycl::queue *queue, AccessorT &in_data,
+                                     AccessorT &out_data, int num_wg,
+                                     int group_size) {
   sycl::nd_range<1> thread_range(num_wg * group_size, group_size);
   sycl::event event = queue->submit([&](sycl::handler &cgh) {
+    cgh.require(in_data);
+    cgh.require(out_data);
     cgh.parallel_for(thread_range, [=](sycl::nd_item<1> item) {
       auto id = item.get_global_linear_id();
       auto group = item.get_group();
@@ -27,33 +30,35 @@ sycl::event compiler_group_scan_impl(sycl::queue *queue, T *in_data,
   return event;
 }
 
-template <typename T>
-void test_compiler_group_scan(sycl::queue *queue, T *in_data, T *out_data,
-                              int num_wg, int group_size) {
-  compiler_group_scan_impl(queue, in_data, out_data, num_wg, group_size);
+template <typename T, typename AccessorT>
+void test_compiler_group_scan(sycl::queue *queue, AccessorT &in_data,
+                              AccessorT &out_data, int num_wg, int group_size) {
+  compiler_group_scan_impl<T>(queue, in_data, out_data, num_wg, group_size);
 }
 
 int main(int argc, const char **argv) {
-  int num_wg = 1;
-  int group_size = 16;
+  constexpr int num_wg = 1;
+  constexpr int group_size = 16;
 
   sycl::queue queue;
-
-  typedef int T;
-  size_t nelems = num_wg * group_size;
-  T *data = sycl::malloc_shared<T>(nelems, queue);
-  T *result = sycl::malloc_shared<T>(nelems, queue);
-  queue.fill<T>(data, T(2), nelems).wait();
-  queue.memset(result, 0, nelems * sizeof(T)).wait();
-
-  test_compiler_group_scan(&queue, data, result, num_wg, group_size);
-  queue.wait();
-  T expected[] = {1,   2,   4,    8,    16,   32,   64,    128,
-                  256, 512, 1024, 2048, 4096, 8192, 16384, 32768};
-  for (int i = 0; i < sizeof(expected) / sizeof(T); ++i) {
-    assert(result[i] == expected[i]);
+  constexpr size_t nelems = num_wg * group_size;
+  int data[nelems];
+  int result[nelems];
+  for (size_t i = 0; i < nelems; ++i) {
+    data[i] = 2;
+    result[i] = 0;
+  }
+  sycl::buffer<int> data_buf{&data[0], sycl::range{nelems}};
+  sycl::buffer<int> result_buf{&result[0], sycl::range{nelems}};
+  sycl::accessor data_acc{data_buf};
+  sycl::accessor result_acc{result_buf};
+  test_compiler_group_scan<int>(&queue, data_acc, result_acc, num_wg,
+                                group_size);
+  sycl::host_accessor result_host{result_buf};
+  int expected[] = {1,   2,   4,    8,    16,   32,   64,    128,
+                    256, 512, 1024, 2048, 4096, 8192, 16384, 32768};
+  for (int i = 0; i < sizeof(expected) / sizeof(int); ++i) {
+    assert(result_host[i] == expected[i]);
   }
-  sycl::free(data, queue);
-  sycl::free(result, queue);
   return 0;
 }
diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
index ba0c49fa68bf7..e346ea142b759 100644
--- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -41,27 +41,29 @@ void testRootGroup() {
           max_num_work_group_sync>(q);
   const auto props = sycl::ext::oneapi::experimental::properties{
       sycl::ext::oneapi::experimental::use_root_sync};
-
-  int *data = sycl::malloc_shared<int>(maxWGs * WorkGroupSize, q);
+  sycl::buffer<int> dataBuf{sycl::range{maxWGs * WorkGroupSize}};
   const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
-  q.parallel_for<class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) {
-    auto root = it.ext_oneapi_get_root_group();
-    data[root.get_local_id()] = root.get_local_id();
-    sycl::group_barrier(root);
+  q.submit([&](sycl::handler &h) {
+    sycl::accessor data{dataBuf, h};
+    h.parallel_for<class RootGroupKernel>(
+        range, props, [=](sycl::nd_item<1> it) {
+          auto root = it.ext_oneapi_get_root_group();
+          data[root.get_local_id()] = root.get_local_id();
+          sycl::group_barrier(root);
 
-    root = sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>();
-    int sum = data[root.get_local_id()] +
-              data[root.get_local_range() - root.get_local_id() - 1];
-    sycl::group_barrier(root);
-    data[root.get_local_id()] = sum;
+          root =
+              sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>();
+          int sum = data[root.get_local_id()] +
+                    data[root.get_local_range() - root.get_local_id() - 1];
+          sycl::group_barrier(root);
+          data[root.get_local_id()] = sum;
+        });
   });
-  q.wait();
-
+  sycl::host_accessor data{dataBuf};
   const int workItemCount = static_cast<int>(range.get_global_range().size());
   for (int i = 0; i < workItemCount; i++) {
     assert(data[i] == (workItemCount - 1));
   }
-  sycl::free(data, q);
 }
 
 void testRootGroupFunctions() {
@@ -76,44 +78,45 @@ void testRootGroupFunctions() {
       sycl::ext::oneapi::experimental::use_root_sync};
 
   constexpr int testCount = 10;
-  bool *testResults = sycl::malloc_shared<bool>(testCount, q);
+  sycl::buffer<bool> testResultsBuf{sycl::range{testCount}};
   const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
-  q.parallel_for<class RootGroupFunctionsKernel>(
-      range, props, [=](sycl::nd_item<1> it) {
-        const auto root = it.ext_oneapi_get_root_group();
-        if (root.leader() || root.get_local_id() == 3) {
-          testResults[0] = root.get_group_id() == sycl::id<1>(0);
-          testResults[1] = root.leader()
-                               ? root.get_local_id() == sycl::id<1>(0)
-                               : root.get_local_id() == sycl::id<1>(3);
-          testResults[2] = root.get_group_range() == sycl::range<1>(1);
-          testResults[3] =
-              root.get_local_range() == sycl::range<1>(WorkGroupSize);
-          testResults[4] =
-              root.get_max_local_range() == sycl::range<1>(WorkGroupSize);
-          testResults[5] = root.get_group_linear_id() == 0;
-          testResults[6] =
-              root.get_local_linear_id() == root.get_local_id().get(0);
-          testResults[7] = root.get_group_linear_range() == 1;
-          testResults[8] = root.get_local_linear_range() == WorkGroupSize;
-
-          const auto child =
-              sycl::ext::oneapi::experimental::get_child_group(root);
-          const auto grandchild =
-              sycl::ext::oneapi::experimental::get_child_group(child);
-          testResults[9] = child == it.get_group();
-          static_assert(
-              std::is_same_v<std::remove_cv<decltype(grandchild)>::type,
-                             sycl::sub_group>,
-              "get_child_group(sycl::group) must return a sycl::sub_group");
-        }
-      });
-  q.wait();
+  q.submit([&](sycl::handler &h) {
+    sycl::accessor testResults{testResultsBuf, h};
+    h.parallel_for<class RootGroupFunctionsKernel>(
+        range, props, [=](sycl::nd_item<1> it) {
+          const auto root = it.ext_oneapi_get_root_group();
+          if (root.leader() || root.get_local_id() == 3) {
+            testResults[0] = root.get_group_id() == sycl::id<1>(0);
+            testResults[1] = root.leader()
+                                 ? root.get_local_id() == sycl::id<1>(0)
+                                 : root.get_local_id() == sycl::id<1>(3);
+            testResults[2] = root.get_group_range() == sycl::range<1>(1);
+            testResults[3] =
+                root.get_local_range() == sycl::range<1>(WorkGroupSize);
+            testResults[4] =
+                root.get_max_local_range() == sycl::range<1>(WorkGroupSize);
+            testResults[5] = root.get_group_linear_id() == 0;
+            testResults[6] =
+                root.get_local_linear_id() == root.get_local_id().get(0);
+            testResults[7] = root.get_group_linear_range() == 1;
+            testResults[8] = root.get_local_linear_range() == WorkGroupSize;
 
+            const auto child =
+                sycl::ext::oneapi::experimental::get_child_group(root);
+            const auto grandchild =
+                sycl::ext::oneapi::experimental::get_child_group(child);
+            testResults[9] = child == it.get_group();
+            static_assert(
+                std::is_same_v<std::remove_cv<decltype(grandchild)>::type,
+                               sycl::sub_group>,
+                "get_child_group(sycl::group) must return a sycl::sub_group");
+          }
+        });
+  });
+  sycl::host_accessor testResults{testResultsBuf};
   for (int i = 0; i < testCount; i++) {
     assert(testResults[i]);
   }
-  sycl::free(testResults, q);
 }
 
 int main() {
diff --git a/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp b/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp
index 3393202b5a370..aeb4ab44acb49 100644
--- a/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp
+++ b/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp
@@ -34,7 +34,8 @@ int main() {
   Failed += Check(Q, "host_task", [&]() {
     return Q.submit([&](sycl::handler &CGH) { CGH.host_task([]() {}); });
   });
-
+  if (!Q.get_device().has(sycl::aspect::usm_shared_allocations))
+    return Failed;
   constexpr size_t N = 64;
   int *Data1 = sycl::malloc_shared<int>(N, Q);
   int *Data2 = sycl::malloc_shared<int>(N, Q);
diff --git a/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp b/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp
index 45e5815606dbe..61987d5b5b3cb 100644
--- a/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp
+++ b/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp
@@ -15,20 +15,26 @@ int main() {
   sycl::queue Q1{Ctx, Dev, {sycl::property::queue::in_order{}}};
   sycl::queue Q2{Ctx, Dev, {sycl::property::queue::in_order{}}};
 
-  int *DevData = sycl::malloc_shared<int>(N, Dev, Ctx);
+  sycl::buffer<int> DevDataBuf{sycl::range{N}};
+  sycl::accessor DevData{DevDataBuf};
   int *HostData = (int *)malloc(N * sizeof(int) * 10);
 
   for (size_t I = 0; I < 10; ++I) {
-    Q1.fill<int>(DevData, 0, N);
-    sycl::event E1 = Q1.parallel_for(
-        N, [=](sycl::item<1> Idx) { DevData[Idx] = 42 + Idx[0] + I; });
+    Q1.fill(DevData, 0);
+    sycl::event E1 = Q1.submit([&](sycl::handler &h) {
+      h.require(DevData);
+      h.parallel_for(
+          N, [=](sycl::item<1> Idx) { DevData[Idx] = 42 + Idx[0] + I; });
+    });
 
     Q2.ext_oneapi_set_external_event(E1);
-    sycl::event E2 =
-        Q2.parallel_for(N, [=](sycl::item<1> Idx) { ++DevData[Idx]; });
+    sycl::event E2 = Q2.submit([&](sycl::handler &h) {
+      h.require(DevData);
+      h.parallel_for(N, [=](sycl::item<1> Idx) { ++DevData[Idx]; });
+    });
 
     Q1.ext_oneapi_set_external_event(E2);
-    Q1.copy(DevData, HostData + N * I, N);
+    Q1.copy(DevData, HostData + N * I);
   }
 
   Q1.wait_and_throw();
@@ -46,9 +52,6 @@ int main() {
       }
     }
   }
-
-  sycl::free(DevData, Ctx);
   free(HostData);
-
   return Failures;
 }
diff --git a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp
index 4276efe36366f..c50e23b1c17c4 100644
--- a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp
+++ b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp
@@ -74,7 +74,7 @@ int main() {
   // CHECK-CACHE: piKernelRelease
   // CHECK-CACHE: piProgramRelease
   // CHECK-CACHE: piEventsWait
-  auto *p = malloc_shared<int>(1, q);
+  auto *p = malloc_device<int>(1, q);
   for (int i = 0; i < 2; ++i)
     q.submit([&](handler &cgh) {
        cgh.set_specialization_constant<spec_id>(i);
diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
index 38286c48b8a0f..79347114ec2a4 100644
--- a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
@@ -14,19 +14,21 @@ int main() {
   queue q1{ext::codeplay::experimental::property::queue::enable_fusion{}};
   queue q2{ext::codeplay::experimental::property::queue::enable_fusion{}};
 
-  int *in1 = sycl::malloc_shared<int>(dataSize, q1);
-  int *in2 = sycl::malloc_shared<int>(dataSize, q1);
-  int *in3 = sycl::malloc_shared<int>(dataSize, q1);
-  int *tmp = sycl::malloc_shared<int>(dataSize, q1);
-  int *out = sycl::malloc_shared<int>(dataSize, q1);
-
-  for (size_t i = 0; i < dataSize; ++i) {
-    in1[i] = i * 2;
-    in2[i] = i * 3;
-    in3[i] = i * 4;
-    tmp[i] = -1;
-    out[i] = -1;
-  }
+  int *in1 = sycl::malloc_device<int>(dataSize, q1);
+  int *in2 = sycl::malloc_device<int>(dataSize, q1);
+  int *in3 = sycl::malloc_device<int>(dataSize, q1);
+  int *tmp = sycl::malloc_device<int>(dataSize, q1);
+  int *out = sycl::malloc_device<int>(dataSize, q1);
+
+  q1.single_task<class InitKernel>([=]() {
+      for (size_t i = 0; i < dataSize; ++i) {
+        in1[i] = i * 2;
+        in2[i] = i * 3;
+        in3[i] = i * 4;
+        tmp[i] = -1;
+        out[i] = -1;
+      }
+    }).wait();
 
   ext::codeplay::experimental::fusion_wrapper fw1{q1};
   fw1.start_fusion();
@@ -71,10 +73,12 @@ int main() {
 
   q1.wait();
   q2.wait();
-
+  int host_out[dataSize];
+  q1.memcpy(host_out, out, dataSize * sizeof(int));
+  q1.wait();
   // Check the results
   for (size_t i = 0; i < dataSize; ++i) {
-    assert(out[i] == (40 * i * i) && "Computation error");
+    assert(host_out[i] == (40 * i * i) && "Computation error");
   }
   sycl::free(in1, q1);
   sycl::free(in2, q1);
diff --git a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
index 0e6bc288812fd..f3d2cc97b9ae5 100644
--- a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
@@ -1,6 +1,5 @@
 // RUN: %{build} -fsycl-embed-ir -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
-
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
 
@@ -16,21 +15,24 @@ int main() {
 
   queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
 
-  int *in1 = sycl::malloc_shared<int>(dataSize, q);
-  int *in2 = sycl::malloc_shared<int>(dataSize, q);
-  int *in3 = sycl::malloc_shared<int>(dataSize, q);
-  int *tmp = sycl::malloc_shared<int>(dataSize, q);
-  int *out = sycl::malloc_shared<int>(dataSize, q);
+  int *in1 = sycl::malloc_device<int>(dataSize, q);
+  int *in2 = sycl::malloc_device<int>(dataSize, q);
+  int *in3 = sycl::malloc_device<int>(dataSize, q);
+  int *tmp = sycl::malloc_device<int>(dataSize, q);
+  int *out = sycl::malloc_device<int>(dataSize, q);
   int dst[dataSize];
-
   for (size_t i = 0; i < dataSize; ++i) {
-    in1[i] = i * 2;
-    in2[i] = i * 3;
-    in3[i] = i * 4;
-    tmp[i] = -1;
-    out[i] = -1;
     dst[i] = -1;
   }
+  q.single_task<class InitKernel>([=]() {
+     for (size_t i = 0; i < dataSize; ++i) {
+       in1[i] = i * 2;
+       in2[i] = i * 3;
+       in3[i] = i * 4;
+       tmp[i] = -1;
+       out[i] = -1;
+     }
+   }).wait();
 
   ext::codeplay::experimental::fusion_wrapper fw{q};
   fw.start_fusion();
@@ -59,14 +61,16 @@ int main() {
          "Queue should not be in fusion mode anymore");
 
   fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
-
+  int host_out[dataSize];
+  q.memcpy(host_out, out, dataSize * sizeof(int));
+  q.wait();
   for (size_t i = 0; i < dataSize; ++i) {
-    std::cout << out[i] << ", ";
+    std::cout << host_out[i] << ", ";
   }
   std::cout << "\n";
   // Check the results
   for (size_t i = 0; i < dataSize; ++i) {
-    assert(out[i] == (20 * i * i) && "Computation error");
+    assert(host_out[i] == (20 * i * i) && "Computation error");
     assert(dst[i] == (5 * i) && "Computation error");
   }
 
diff --git a/sycl/test-e2e/Reduction/reduction_internal.cpp b/sycl/test-e2e/Reduction/reduction_internal.cpp
index 57947a25176e1..39c64932802da 100644
--- a/sycl/test-e2e/Reduction/reduction_internal.cpp
+++ b/sycl/test-e2e/Reduction/reduction_internal.cpp
@@ -80,13 +80,13 @@ static void test(RedStorage &Storage, RangeTy Range) {
          cgh, Range, ext::oneapi::experimental::empty_properties_t{}, RedSycl,
          [=](auto Item, auto &Red) { Red.combine(T{1}); });
    }).wait();
-
-  auto *Result = malloc_shared<T>(1, q);
+  sycl::buffer<T> ResultBuf{sycl::range{1}};
   q.submit([&](handler &cgh) {
-     auto RedAcc = GetRedAcc(cgh);
-     cgh.single_task([=]() { *Result = RedAcc[0]; });
-   }).wait();
-
+    sycl::accessor Result{ResultBuf, cgh};
+    auto RedAcc = GetRedAcc(cgh);
+    cgh.single_task([=]() { Result[0] = RedAcc[0]; });
+  });
+  sycl::host_accessor Result{ResultBuf};
   auto N = get_global_range(Range).size();
   int Expected = InitToIdentity ? N : Init + N;
 #if defined(__PRETTY_FUNCTION__)
@@ -94,10 +94,8 @@ static void test(RedStorage &Storage, RangeTy Range) {
 #elif defined(__FUNCSIG__)
   std::cout << __FUNCSIG__;
 #endif
-  std::cout << ": " << *Result << ", expected " << Expected << std::endl;
-  assert(*Result == Expected);
-
-  free(Result, q);
+  std::cout << ": " << Result[0] << ", expected " << Expected << std::endl;
+  assert(Result[0] == Expected);
 }
 
 template <int... Inds, class F>
diff --git a/sycl/test-e2e/Reduction/reduction_range_item.cpp b/sycl/test-e2e/Reduction/reduction_range_item.cpp
index ed9806e4753b5..93fe915ac0254 100644
--- a/sycl/test-e2e/Reduction/reduction_range_item.cpp
+++ b/sycl/test-e2e/Reduction/reduction_range_item.cpp
@@ -3,18 +3,20 @@
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
-
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
 
 int main() {
   queue q;
-  auto *RedMem = malloc_shared<int>(1, q);
-  auto *Success = malloc_shared<bool>(1, q);
-  *Success = true;
-
-  *RedMem = 0;
+  auto *RedMem = malloc_device<int>(1, q);
+  auto *Success = malloc_device<bool>(1, q);
+  int RedMemHost;
+  bool SuccessHost;
+  RedMemHost = 0;
+  SuccessHost = true;
+  q.memcpy(RedMem, &RedMemHost, sizeof(int)).wait();
+  q.memcpy(Success, &SuccessHost, sizeof(bool)).wait();
   q.parallel_for(range<1>{7}, reduction(RedMem, std::plus<int>{}),
                  [=](item<1> Item, auto &Red) {
                    Red += 1;
@@ -24,11 +26,13 @@ int main() {
                      *Success = false;
                  })
       .wait();
+  q.memcpy(&RedMemHost, RedMem, sizeof(int)).wait();
+  q.memcpy(&SuccessHost, Success, sizeof(bool)).wait();
+  assert(RedMemHost == 7);
+  assert(SuccessHost);
 
-  assert(*RedMem == 7);
-  assert(*Success);
-
-  *RedMem = 0;
+  RedMemHost = 0;
+  q.memcpy(RedMem, &RedMemHost, sizeof(int)).wait();
   q.parallel_for(range<2>{1030, 7}, reduction(RedMem, std::plus<int>{}),
                  [=](item<2> Item, auto &Red) {
                    Red += 1;
@@ -44,8 +48,10 @@ int main() {
                  })
       .wait();
 
-  assert(*RedMem == 1030 * 7);
-  assert(*Success);
+  q.memcpy(&RedMemHost, RedMem, sizeof(int)).wait();
+  q.memcpy(&SuccessHost, Success, sizeof(bool)).wait();
+  assert(RedMemHost == 1030 * 7);
+  assert(SuccessHost);
 
   free(RedMem, q);
   free(Success, q);
diff --git a/sycl/test-e2e/Reduction/reduction_span.cpp b/sycl/test-e2e/Reduction/reduction_span.cpp
index 6c06d377eabe2..23ee61e805f9b 100644
--- a/sycl/test-e2e/Reduction/reduction_span.cpp
+++ b/sycl/test-e2e/Reduction/reduction_span.cpp
@@ -3,7 +3,6 @@
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
-
 // This test performs basic checks of reductions initialized with a sycl::span
 
 #include <sycl/sycl.hpp>
@@ -44,7 +43,7 @@ template <size_t N, typename T, typename BinaryOperation, typename Range,
 void test(queue Q, Range Rng, T Identity, T Value) {
 
   // Initialize output to identity value
-  T *Output = malloc_shared<T>(N, Q);
+  T *Output = malloc_device<T>(N, Q);
   Q.parallel_for(range<1>{N}, [=](id<1> I) { Output[I] = Identity; }).wait();
 
   // Perform generalized "histogram" with N bins
@@ -70,11 +69,13 @@ void test(queue Q, Range Rng, T Identity, T Value) {
   }
 
   bool Passed = true;
+  T OutputHost[N];
+  Q.memcpy(OutputHost, Output, N * sizeof(T)).wait();
   for (size_t I = 0; I < N; ++I) {
     if (I < Size % N) {
-      Passed &= (Output[I] == Expected);
+      Passed &= (OutputHost[I] == Expected);
     } else {
-      Passed &= (Output[I] == ExpectedRemainder);
+      Passed &= (OutputHost[I] == ExpectedRemainder);
     }
   }
 
diff --git a/sycl/test-e2e/Reduction/reduction_span_pack.cpp b/sycl/test-e2e/Reduction/reduction_span_pack.cpp
index 46862ffe45cf9..4bc4cfb5f9f14 100644
--- a/sycl/test-e2e/Reduction/reduction_span_pack.cpp
+++ b/sycl/test-e2e/Reduction/reduction_span_pack.cpp
@@ -6,7 +6,6 @@
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
-
 // This test performs basic checks of reductions initialized with a pack
 // containing at least one sycl::span
 
@@ -49,9 +48,9 @@ template <size_t N, typename T, typename BinaryOperation, typename Range,
 void test1(queue Q, Range Rng, T Identity, T Value) {
 
   // Initialize output to identity value
-  int *Sum = malloc_shared<int>(1, Q);
+  int *Sum = malloc_device<int>(1, Q);
   Q.single_task([=]() { *Sum = 0; }).wait();
-  T *Output = malloc_shared<T>(N, Q);
+  T *Output = malloc_device<T>(N, Q);
   Q.parallel_for(range<1>{N}, [=](id<1> I) { Output[I] = Identity; }).wait();
 
   // Perform generalized "histogram" with N bins
@@ -81,14 +80,18 @@ void test1(queue Q, Range Rng, T Identity, T Value) {
   }
 
   bool Passed = true;
+  T OutputHost[N];
+  Q.memcpy(OutputHost, Output, N * sizeof(T)).wait();
   for (size_t I = 0; I < N; ++I) {
     if (I < Size % N) {
-      Passed &= (Output[I] == Expected);
+      Passed &= (OutputHost[I] == Expected);
     } else {
-      Passed &= (Output[I] == ExpectedRemainder);
+      Passed &= (OutputHost[I] == ExpectedRemainder);
     }
   }
-  Passed &= (*Sum == Size);
+  int SumHost;
+  Q.memcpy(&SumHost, Sum, sizeof(int)).wait();
+  Passed &= (SumHost == Size);
 
   free(Output, Q);
   free(Sum, Q);
@@ -101,9 +104,9 @@ template <size_t N, typename T, typename BinaryOperation, typename Range,
 void test2(queue Q, Range Rng, T Identity, T Value) {
 
   // Initialize output to identity value
-  int *Output1 = malloc_shared<int>(N, Q);
+  int *Output1 = malloc_device<int>(N, Q);
   Q.parallel_for(range<1>{N}, [=](id<1> I) { Output1[I] = 0; }).wait();
-  T *Output2 = malloc_shared<T>(N, Q);
+  T *Output2 = malloc_device<T>(N, Q);
   Q.parallel_for(range<1>{N}, [=](id<1> I) { Output2[I] = Identity; }).wait();
 
   // Perform generalized "histogram" with N bins
@@ -121,7 +124,10 @@ void test2(queue Q, Range Rng, T Identity, T Value) {
   } else /*if (SubmissionMode == submission_mode::queue) */ {
     Q.parallel_for(Rng, Redu1, Redu2, Kern).wait();
   }
-
+  int Output1Host[N];
+  T Output2Host[N];
+  Q.memcpy(Output1Host, Output1, N * sizeof(int)).wait();
+  Q.memcpy(Output2Host, Output2, N * sizeof(T)).wait();
   size_t Size = getLinearSize(Rng);
   bool Passed = true;
   // Span1
@@ -132,12 +138,11 @@ void test2(queue Q, Range Rng, T Identity, T Value) {
       ExpectedRemainder = Expected;
       Expected += 1;
     }
-
     for (size_t I = 0; I < N; ++I) {
       if (I < Size % N) {
-        Passed &= (Output1[I] == Expected);
+        Passed &= (Output1Host[I] == Expected);
       } else {
-        Passed &= (Output1[I] == ExpectedRemainder);
+        Passed &= (Output1Host[I] == ExpectedRemainder);
       }
     }
   }
@@ -153,9 +158,9 @@ void test2(queue Q, Range Rng, T Identity, T Value) {
 
     for (size_t I = 0; I < N; ++I) {
       if (I < Size % N) {
-        Passed &= (Output2[I] == Expected);
+        Passed &= (Output2Host[I] == Expected);
       } else {
-        Passed &= (Output2[I] == ExpectedRemainder);
+        Passed &= (Output2Host[I] == ExpectedRemainder);
       }
     }
   }
diff --git a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp
index a7d3601210fa8..81e5ddc8b1a27 100644
--- a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp
+++ b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp
@@ -1,6 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-
+// REQUIRES: aspect-usm_shared_allocations
 // This test ensures the result computed by exclusive_scan_over_group
 // for the first work item when given a short or char argument with
 // the maximum or minimum operator is computed correctly.
diff --git a/sycl/test-e2e/Regression/group_local_linear_id.cpp b/sycl/test-e2e/Regression/group_local_linear_id.cpp
index d3562c2b25cbf..8fc216f76df31 100644
--- a/sycl/test-e2e/Regression/group_local_linear_id.cpp
+++ b/sycl/test-e2e/Regression/group_local_linear_id.cpp
@@ -11,14 +11,17 @@ int main() {
   const sycl::range<3> GlobalRange(2, 8, 16);
   const sycl::range<3> LocalRange(2, 4, 4);
   sycl::queue Q;
-  bool *ReadSame = sycl::malloc_shared<bool>(GlobalRange.size(), Q);
-  Q.parallel_for(sycl::nd_range<3>{GlobalRange, LocalRange},
-                 [=](sycl::nd_item<3> Item) {
-                   ReadSame[Item.get_global_linear_id()] =
-                       Item.get_local_linear_id() ==
-                       Item.get_group().get_local_linear_id();
-                 })
-      .wait();
+  sycl::buffer<bool> ReadSame_buf{GlobalRange.size()};
+  Q.submit([&](sycl::handler &h) {
+    sycl::accessor ReadSame{ReadSame_buf, h};
+    h.parallel_for(sycl::nd_range<3>{GlobalRange, LocalRange},
+                   [=](sycl::nd_item<3> Item) {
+                     ReadSame[Item.get_global_linear_id()] =
+                         Item.get_local_linear_id() ==
+                         Item.get_group().get_local_linear_id();
+                   });
+  });
+  sycl::host_accessor ReadSame{ReadSame_buf};
   int Failures = 0;
   for (size_t I = 0; I < GlobalRange.size(); ++I) {
     if (ReadSame[I])
@@ -26,6 +29,5 @@ int main() {
     ++Failures;
     std::cout << "Read mismatch at index " << I << std::endl;
   }
-  sycl::free(ReadSame, Q);
   return Failures;
 }
diff --git a/sycl/test-e2e/Regression/half_operators.cpp b/sycl/test-e2e/Regression/half_operators.cpp
index b227806a02bc0..14f6924260ec6 100644
--- a/sycl/test-e2e/Regression/half_operators.cpp
+++ b/sycl/test-e2e/Regression/half_operators.cpp
@@ -7,11 +7,6 @@
 
 using namespace sycl;
 
-template <typename T>
-using shared_allocator = sycl::usm_allocator<T, sycl::usm::alloc::shared>;
-
-template <typename T> using shared_vector = std::vector<T, shared_allocator<T>>;
-
 template <typename T> bool are_bitwise_equal(T lhs, T rhs) {
   constexpr size_t size{sizeof(T)};
 
@@ -33,33 +28,34 @@ template <typename T> bool test(sycl::queue &queue) {
 
   static const T inexact = static_cast<T>(0.1);
 
-  shared_vector<T> result_source{NumElems, shared_allocator<T>{queue}};
-  shared_vector<T> input{NumElems, shared_allocator<T>{queue}};
+  std::vector<T> result_source_vec(NumElems);
+  std::vector<T> input_vec(NumElems);
 
   for (size_t i = 0; i < NumElems; ++i) {
-    input[i] = inexact * i;
+    input_vec[i] = inexact * i;
   }
-
-  queue.submit([&](sycl::handler &cgh) {
-    auto out_source = result_source.data();
-    auto in = input.data();
-
-    cgh.single_task<>([=]() {
-      for (size_t i = 0; i < NumElems; ++i) {
-        auto source = in[i];
-        ++source;
-        out_source[i] = source;
-      }
+  {
+    sycl::buffer<T> result_source_buf{result_source_vec};
+    sycl::buffer<T> input_buf{input_vec};
+    queue.submit([&](sycl::handler &cgh) {
+      sycl::accessor out_source{result_source_buf, cgh};
+      sycl::accessor in{input_buf, cgh};
+      cgh.single_task<>([=]() {
+        for (size_t i = 0; i < NumElems; ++i) {
+          auto source = in[i];
+          ++source;
+          out_source[i] = source;
+        }
+      });
     });
-  });
-  queue.wait_and_throw();
-
+    queue.wait_and_throw();
+  } // buffers go out of scope here and write back to the vectors
   for (size_t i = 0; i < NumElems; ++i) {
-    T expected_value = input[i] + 1;
+    T expected_value = input_vec[i] + 1;
 
-    if (!are_bitwise_equal(expected_value, result_source[i])) {
+    if (!are_bitwise_equal(expected_value, result_source_vec[i])) {
       pass = false;
-      std::cout << "Sample failed retrieved value: " << result_source[i]
+      std::cout << "Sample failed retrieved value: " << result_source_vec[i]
                 << ", but expected: " << expected_value << ", at index: " << i
                 << std::endl;
     }
diff --git a/sycl/test-e2e/Regression/pf-wg-atomic64.cpp b/sycl/test-e2e/Regression/pf-wg-atomic64.cpp
index f2985b5a33b1d..d7adc4e6c153b 100644
--- a/sycl/test-e2e/Regression/pf-wg-atomic64.cpp
+++ b/sycl/test-e2e/Regression/pf-wg-atomic64.cpp
@@ -10,11 +10,12 @@ using AtomicRefT =
 
 int main() {
   queue q;
-  auto *p = malloc_shared<unsigned long long>(1, q);
+  sycl::buffer<unsigned long long> p_buf{sycl::range{1}};
   try {
     q.submit([&](sycl::handler &cgh) {
+       sycl::accessor p{p_buf, cgh};
        cgh.parallel_for_work_group(range{1}, range{1}, [=](group<1>) {
-         AtomicRefT feature(*p);
+         AtomicRefT feature(p[0]);
          feature += 42;
        });
      }).wait();
diff --git a/sycl/test-e2e/Regression/range-rounding-this-id.cpp b/sycl/test-e2e/Regression/range-rounding-this-id.cpp
index 33fa41c60cc68..a5970adf46e41 100644
--- a/sycl/test-e2e/Regression/range-rounding-this-id.cpp
+++ b/sycl/test-e2e/Regression/range-rounding-this-id.cpp
@@ -30,13 +30,17 @@ template <int D> void test(queue &q) {
     id<D> this_id;
     id<D> ref_id;
   };
-  std::vector<T, usm_allocator<T, usm::alloc::shared>> vec(range.size(), q);
-  auto *p = vec.data();
-  q.parallel_for(range, [=](auto it) {
-     p[it.get_linear_id()] = {sycl::ext::oneapi::experimental::this_id<D>(),
-                              it.get_id()};
-   }).wait_and_throw();
-
+  std::vector<T> vec(range.size());
+  {
+    sycl::buffer<T> p_buf{vec};
+    q.submit([&](sycl::handler &h) {
+       sycl::accessor p{p_buf, h};
+       h.parallel_for(range, [=](auto it) {
+         p[it.get_linear_id()] = {sycl::ext::oneapi::experimental::this_id<D>(),
+                                  it.get_id()};
+       });
+     }).wait_and_throw();
+  } // p_buf goes out of scope here and writed back to vec
   for (const auto &[this_item, ref_item] : vec) {
     if (this_item != ref_item) {
       std::cout << "fail: " << this_item << " != " << ref_item << "\n";
diff --git a/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp b/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp
index dc138d9b79da8..c2d0d3e84ca7c 100644
--- a/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp
+++ b/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp
@@ -2,7 +2,6 @@
 // RUN: %{build} -o %t.out
 //
 // RUN: %{run} %t.out
-
 // Tests that a previously known case for reduction doesn't cause a requirement
 // for atomic64.
 // TODO: When aspect requirements are added to testing, this test could be set
@@ -19,7 +18,7 @@ using namespace sycl;
 int main() {
   queue Q;
 
-  long long *Out = malloc_shared<long long>(1, Q);
+  long long *Out = malloc_device<long long>(1, Q);
 
   // Case 1: nd_range reduction with 64-bit integer and either sycl::plus,
   // sycl::minimum or sycl::maximum. group_reduce_and_atomic_cross_wg strategy
diff --git a/sycl/test/type_traits/type_traits.cpp b/sycl/test/type_traits/type_traits.cpp
index 52de6a18570ef..3c53822517d62 100644
--- a/sycl/test/type_traits/type_traits.cpp
+++ b/sycl/test/type_traits/type_traits.cpp
@@ -227,5 +227,23 @@ int main() {
   test_is_same_vector_size<true, s::constant_ptr<s::int2>, s::int2>();
   test_is_same_vector_size<false, s::constant_ptr<s::int2>, float>();
 
+#ifdef __SYCL_DEVICE_ONLY__
+  static_assert(
+      std::is_same_v<
+          s::remove_decoration_t<const __attribute__((opencl_global)) int>,
+          const int>);
+  static_assert(
+      std::is_same_v<s::remove_decoration_t<const volatile
+                                            __attribute__((opencl_global)) int>,
+                     const volatile int>);
+  static_assert(
+      std::is_same_v<
+          s::remove_decoration_t<const __attribute__((opencl_global)) int *>,
+          const int *>);
+  static_assert(std::is_same_v<s::remove_decoration_t<const __attribute__((
+                                   opencl_global)) int *const>,
+                               const int *const>);
+#endif
+
   return 0;
 }