From 74047b61bcf9724fba995372cfff882b9d8dc2d7 Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Tue, 1 Jul 2025 13:49:34 +0200 Subject: [PATCH 01/32] =?UTF-8?q?[SYCL]=20Align=20Non-Uniform=20Groups=20w?= =?UTF-8?q?ith=20PR#14604:=20WIP:=20-D=5FFORTIFY=5FSOURCE=3D2=20=20=20*=20?= =?UTF-8?q?fixed=5Fsize=5Fgroup=20=E2=86=92=20chunk=20=20=20*=20tangle=5Fg?= =?UTF-8?q?roup=20=E2=86=92=20tangle=20=20=20*=20ballot=5Fgroup=20?= =?UTF-8?q?=E2=86=92=20fragment=20=20=20*=20opportunistic=5Fgroup=20?= =?UTF-8?q?=E2=86=92=20merged=20into=20fragment=20=20=20*=20get=5Fballot?= =?UTF-8?q?=5Fgroup()=20=E2=86=92=20binary=5Fpartition()=20=20=20*=20get?= =?UTF-8?q?=5Ffixed=5Fsize=5Fgroup()=20=E2=86=92=20chunked=5Fpartition()=20=20=20*=20get=5Ftangle=5Fgroup()=20=E2=86=92=20entangle(?= =?UTF-8?q?)=20=20=20+=20is=5Fuser=5Fconstructed=5Fupdated=20and=20HasExte?= =?UTF-8?q?nsionWordBoundary=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llvm/cmake/modules/AddSecurityFlags.cmake | 12 +- .../llvm/SYCLLowerIR/DeviceConfigFile.td | 27 +- sycl/include/sycl/detail/spirv.hpp | 172 ++++++++----- .../ext/oneapi/experimental/ballot_group.hpp | 182 -------------- .../sycl/ext/oneapi/experimental/chunk.hpp | 231 ++++++++++++++++++ .../oneapi/experimental/fixed_size_group.hpp | 194 --------------- .../{opportunistic_group.hpp => fragment.hpp} | 117 ++++++--- .../experimental/non_uniform_groups.hpp | 26 +- .../{tangle_group.hpp => tangle.hpp} | 85 ++++--- sycl/include/sycl/info/aspects.def | 65 +++-- sycl/include/sycl/sycl.hpp | 11 +- sycl/source/detail/device_impl.cpp | 3 +- sycl/source/detail/device_impl.hpp | 32 +-- sycl/test-e2e/Assert/check_resource_leak.cpp | 2 +- .../{fixed_size_group.cpp => chunk.cpp} | 20 +- ...up_algorithms.cpp => chunk_algorithms.cpp} | 34 +-- .../{ballot_group.cpp => fragment.cpp} | 20 +- ...algorithms.cpp => fragment_algorithms.cpp} | 6 +- ...ortunistic_group.cpp => opportunistic.cpp} | 28 ++- ...ithms.cpp => opportunistic_algorithms.cpp} | 6 +- .../{tangle_group.cpp => tangle.cpp} | 12 +- ...p_algorithms.cpp => tangle_algorithms.cpp} | 10 +- .../no-unsupported-without-info.cpp | 10 +- .../is_user_constructed.cpp | 29 ++- sycl/test/non-uniform-groups/type_traits.cpp | 31 +++ sycl/test/regression/group_algorithms.cpp | 12 +- sycl/unittests/context_device/CMakeLists.txt | 1 + .../HasExtensionWordBoundary.cpp | 125 ++++++++++ 28 files changed, 842 insertions(+), 661 deletions(-) delete mode 100644 sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp create mode 100644 sycl/include/sycl/ext/oneapi/experimental/chunk.hpp delete mode 100644 sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp rename sycl/include/sycl/ext/oneapi/experimental/{opportunistic_group.hpp => fragment.hpp} (53%) rename sycl/include/sycl/ext/oneapi/experimental/{tangle_group.hpp => tangle.hpp} (64%) rename sycl/test-e2e/NonUniformGroups/{fixed_size_group.cpp => chunk.cpp} (75%) rename sycl/test-e2e/NonUniformGroups/{fixed_size_group_algorithms.cpp => chunk_algorithms.cpp} (85%) rename sycl/test-e2e/NonUniformGroups/{ballot_group.cpp => fragment.cpp} (79%) rename sycl/test-e2e/NonUniformGroups/{ballot_group_algorithms.cpp => fragment_algorithms.cpp} (97%) rename sycl/test-e2e/NonUniformGroups/{opportunistic_group.cpp => opportunistic.cpp} (73%) rename sycl/test-e2e/NonUniformGroups/{opportunistic_group_algorithms.cpp => opportunistic_algorithms.cpp} (97%) rename sycl/test-e2e/NonUniformGroups/{tangle_group.cpp => tangle.cpp} (90%) rename sycl/test-e2e/NonUniformGroups/{tangle_group_algorithms.cpp => tangle_algorithms.cpp} (96%) create mode 100644 sycl/test/non-uniform-groups/type_traits.cpp create mode 100644 sycl/unittests/context_device/HasExtensionWordBoundary.cpp diff --git a/llvm/cmake/modules/AddSecurityFlags.cmake b/llvm/cmake/modules/AddSecurityFlags.cmake index 7f49778e0a5b1..6e1e7058f1dd4 100644 --- a/llvm/cmake/modules/AddSecurityFlags.cmake +++ b/llvm/cmake/modules/AddSecurityFlags.cmake @@ -168,18 +168,18 @@ macro(append_common_extra_security_flags) if(LLVM_ON_UNIX) # Fortify Source (strongly recommended): if(CMAKE_BUILD_TYPE STREQUAL "Debug") - message(WARNING "-D_FORTIFY_SOURCE=3 can only be used with optimization.") - message(WARNING "-D_FORTIFY_SOURCE=3 is not supported.") + message(WARNING "-D_FORTIFY_SOURCE=2 can only be used with optimization.") + message(WARNING "-D_FORTIFY_SOURCE=2 is not supported.") else() # Sanitizers do not work with checked memory functions, such as # __memset_chk. We do not build release packages with sanitizers, so just - # avoid -D_FORTIFY_SOURCE=3 under LLVM_USE_SANITIZER. + # avoid -D_FORTIFY_SOURCE=2 under LLVM_USE_SANITIZER. if(NOT LLVM_USE_SANITIZER) - message(STATUS "Building with -D_FORTIFY_SOURCE=3") - add_definitions(-D_FORTIFY_SOURCE=3) + message(STATUS "Building with -D_FORTIFY_SOURCE=2") + add_definitions(-D_FORTIFY_SOURCE=2) else() message( - WARNING "-D_FORTIFY_SOURCE=3 dropped due to LLVM_USE_SANITIZER.") + WARNING "-D_FORTIFY_SOURCE=2 dropped due to LLVM_USE_SANITIZER.") endif() endif() diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index ab90c3d3b2820..961112d1725c5 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -70,10 +70,9 @@ def AspectExt_oneapi_unique_addressing_per_dim : Aspect<"ext_oneapi_unique_addre def AspectExt_oneapi_bindless_images_sample_1d_usm : Aspect<"ext_oneapi_bindless_images_sample_1d_usm">; def AspectExt_oneapi_bindless_images_sample_2d_usm : Aspect<"ext_oneapi_bindless_images_sample_2d_usm">; def AspectExt_intel_esimd : Aspect<"ext_intel_esimd">; -def AspectExt_oneapi_ballot_group : Aspect<"ext_oneapi_ballot_group">; -def AspectExt_oneapi_fixed_size_group : Aspect<"ext_oneapi_fixed_size_group">; -def AspectExt_oneapi_opportunistic_group : Aspect<"ext_oneapi_opportunistic_group">; -def AspectExt_oneapi_tangle_group : Aspect<"ext_oneapi_tangle_group">; +def AspectExt_oneapi_fragment : Aspect<"ext_oneapi_fragment">; +def AspectExt_oneapi_chunk : Aspect<"ext_oneapi_chunk">; +def AspectExt_oneapi_tangle : Aspect<"ext_oneapi_tangle">; def AspectExt_intel_matrix : Aspect<"ext_intel_matrix">; def AspectExt_oneapi_is_composite : Aspect<"ext_oneapi_is_composite">; def AspectExt_oneapi_is_component : Aspect<"ext_oneapi_is_component">; @@ -152,8 +151,8 @@ def : TargetInfo<"__TestAspectList", AspectExt_oneapi_bindless_sampled_image_fetch_3d, AspectExt_oneapi_bindless_images_gather, AspectExt_intel_esimd, - AspectExt_oneapi_ballot_group, AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group, - AspectExt_oneapi_tangle_group, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component, + AspectExt_oneapi_fragment, AspectExt_oneapi_chunk, + AspectExt_oneapi_tangle, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component, AspectExt_oneapi_graph, AspectExt_oneapi_limited_graph, AspectExt_oneapi_private_alloca, AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem, AspectExt_oneapi_cuda_cluster_group, AspectExt_intel_fpga_task_sequence, @@ -175,9 +174,9 @@ def : TargetInfo<"__TestDeprecatedAspectList", defvar IntelCpuAspects = [ AspectCpu, AspectFp16, AspectFp64, AspectQueue_profiling, AspectAtomic64, AspectExt_oneapi_srgb, AspectExt_oneapi_native_assert, - AspectExt_intel_legacy_image, AspectExt_oneapi_ballot_group, - AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group, - AspectExt_oneapi_tangle_group, AspectExt_oneapi_private_alloca, + AspectExt_intel_legacy_image, AspectExt_oneapi_fragment, + AspectExt_oneapi_chunk, + AspectExt_oneapi_tangle, AspectExt_oneapi_private_alloca, AspectOnline_compiler, AspectOnline_linker, AspectExt_intel_gpu_slices, AspectExt_intel_gpu_subslices_per_slice, AspectExt_intel_gpu_eu_count_per_subslice, AspectExt_intel_gpu_hw_threads_per_eu, AspectExt_intel_device_id, @@ -200,9 +199,9 @@ defvar Fp16Fp64Atomic64 = [AspectFp16, AspectFp64, AspectAtomic64]; defvar Fp16Atomic64 = [AspectFp16, AspectAtomic64]; defvar Sg8_16_32 = [8, 16, 32]; defvar Sg16_32 = [16, 32]; -defvar IntelGPUBaseAspects = [AspectExt_intel_esimd, AspectExt_oneapi_ballot_group, - AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group, - AspectExt_oneapi_tangle_group]; +defvar IntelGPUBaseAspects = [AspectExt_intel_esimd, AspectExt_oneapi_fragment, + AspectExt_oneapi_chunk, + AspectExt_oneapi_tangle]; class IntelTargetInfo Aspects, list subGroupSizesList> : TargetInfo; // Note: only the "canonical" target names are listed here - see @@ -276,8 +275,8 @@ defvar CudaSM90USMAspects = [AspectUsm_system_allocations, AspectUsm_atomic_host defvar CudaMinAspects = !listconcat(CudaMinUSMAspects, [AspectGpu, AspectFp64, AspectOnline_compiler, AspectOnline_linker, AspectQueue_profiling, AspectExt_intel_pci_address, AspectExt_intel_memory_bus_width, AspectExt_intel_device_info_uuid, AspectExt_oneapi_native_assert, AspectExt_intel_free_memory, AspectExt_intel_device_id, - AspectExt_intel_memory_clock_rate, AspectExt_oneapi_ballot_group, AspectExt_oneapi_fixed_size_group, - AspectExt_oneapi_opportunistic_group, AspectExt_oneapi_graph, AspectExt_oneapi_limited_graph]); + AspectExt_intel_memory_clock_rate, AspectExt_oneapi_fragment, AspectExt_oneapi_chunk, + AspectExt_oneapi_tangle, AspectExt_oneapi_graph, AspectExt_oneapi_limited_graph]); // Bindless images aspects are partially supported on CUDA and disabled by default at the moment. defvar CudaBindlessImagesAspects = [AspectExt_oneapi_bindless_images, AspectExt_oneapi_bindless_images_shared_usm, AspectExt_oneapi_bindless_images_1d_usm, AspectExt_oneapi_bindless_images_2d_usm, AspectExt_oneapi_external_memory_import, diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index cbdc1e7e0ffce..163334e5a7969 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -32,11 +32,13 @@ namespace ext { namespace oneapi { struct sub_group; namespace experimental { -template class ballot_group; +template class fragment; template class fixed_size_group; template class root_group; -template class tangle_group; -class opportunistic_group; +template class tangle; +template class fragment; +template class chunk; +// opportunistic_group merged into fragment } // namespace experimental } // namespace oneapi } // namespace ext @@ -64,23 +66,16 @@ inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id); namespace spirv { -template -struct is_tangle_or_opportunistic_group : std::false_type {}; +template struct is_tangle_group : std::false_type {}; template -struct is_tangle_or_opportunistic_group< - sycl::ext::oneapi::experimental::tangle_group> +struct is_tangle_group> : std::true_type {}; -template <> -struct is_tangle_or_opportunistic_group< - sycl::ext::oneapi::experimental::opportunistic_group> : std::true_type {}; - template struct is_ballot_group : std::false_type {}; template -struct is_ballot_group< - sycl::ext::oneapi::experimental::ballot_group> +struct is_ballot_group> : std::true_type {}; template struct is_fixed_size_group : std::false_type {}; @@ -89,6 +84,18 @@ template struct is_fixed_size_group> : std::true_type {}; +template struct is_fragment : std::false_type {}; + +template +struct is_fragment> + : std::true_type {}; + +template struct is_chunk : std::false_type {}; + +template +struct is_chunk> + : std::true_type {}; + template struct group_scope {}; template @@ -108,7 +115,7 @@ template <> struct group_scope<::sycl::sub_group> { }; template -struct group_scope> { +struct group_scope> { static constexpr __spv::Scope::Flag value = group_scope::value; }; @@ -119,13 +126,14 @@ struct group_scope -struct group_scope> { +struct group_scope> { static constexpr __spv::Scope::Flag value = group_scope::value; }; -template <> -struct group_scope<::sycl::ext::oneapi::experimental::opportunistic_group> { - static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup; +template +struct group_scope< + sycl::ext::oneapi::experimental::chunk> { + static constexpr __spv::Scope::Flag value = group_scope::value; }; // Generic shuffles and broadcasts may require multiple calls to @@ -170,9 +178,8 @@ template bool GroupAll(Group, bool pred) { return __spirv_GroupAll(group_scope::value, pred); } template -bool GroupAll(ext::oneapi::experimental::ballot_group g, - bool pred) { - // ballot_group partitions its parent into two groups (0 and 1) +bool GroupAll(ext::oneapi::experimental::fragment g, bool pred) { + // fragment partitions its parent into two groups (0 and 1) // We have to force each group down different control flow // Work-items in the "false" group (0) may still be active if (g.get_group_id() == 1) { @@ -192,23 +199,27 @@ bool GroupAll( static_cast(pred), PartitionSize); } template -bool GroupAll(ext::oneapi::experimental::tangle_group, bool pred) { +bool GroupAll(ext::oneapi::experimental::tangle, bool pred) { return __spirv_GroupNonUniformAll(group_scope::value, pred); } -bool GroupAll(const ext::oneapi::experimental::opportunistic_group &, +template +bool GroupAll(ext::oneapi::experimental::chunk, bool pred) { - return __spirv_GroupNonUniformAll( - group_scope::value, pred); + // Using reduction becaue the GroupNonUniformAll have no support of cluster + // size + return __spirv_GroupNonUniformBitwiseAnd( + group_scope::value, + static_cast(__spv::GroupOperation::ClusteredReduce), + static_cast(pred), ChunkSize); } template bool GroupAny(Group, bool pred) { return __spirv_GroupAny(group_scope::value, pred); } template -bool GroupAny(ext::oneapi::experimental::ballot_group g, - bool pred) { - // ballot_group partitions its parent into two groups (0 and 1) +bool GroupAny(ext::oneapi::experimental::fragment g, bool pred) { + // fragment partitions its parent into two groups (0 and 1) // We have to force each group down different control flow // Work-items in the "false" group (0) may still be active if (g.get_group_id() == 1) { @@ -228,15 +239,23 @@ bool GroupAny( static_cast(pred), PartitionSize); } template -bool GroupAny(ext::oneapi::experimental::tangle_group, bool pred) { +bool GroupAny(ext::oneapi::experimental::tangle, bool pred) { return __spirv_GroupNonUniformAny(group_scope::value, pred); } -bool GroupAny(const ext::oneapi::experimental::opportunistic_group &, + +template +bool GroupAny(ext::oneapi::experimental::chunk, bool pred) { - return __spirv_GroupNonUniformAny( - group_scope::value, pred); + // Using reduction becaue the GroupNonUniformAll have no support of cluster + // size + return __spirv_GroupNonUniformBitwiseOr( + group_scope::value, + static_cast(__spv::GroupOperation::ClusteredReduce), + static_cast(pred), ChunkSize); } +// opportunistic_group merged into fragment + // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic // FIXME: Do not special-case for half or vec once all backends support all data // types. @@ -312,8 +331,8 @@ EnableIfNativeBroadcast GroupBroadcast(Group, T x, IdT local_id) { template EnableIfNativeBroadcast -GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, - T x, IdT local_id) { +GroupBroadcast(sycl::ext::oneapi::experimental::fragment g, T x, + IdT local_id) { // Remap local_id to its original numbering in ParentGroup. auto LocalId = detail::IdToMaskPosition(g, local_id); @@ -323,7 +342,7 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, WidenOpenCLTypeTo32_t WideOCLX = OCLX; auto OCLId = detail::convertToOpenCLType(GroupLocalId); - // ballot_group partitions its parent into two groups (0 and 1) + // fragment partitions its parent into two groups (0 and 1) // We have to force each group down different control flow // Work-items in the "false" group (0) may still be active if (g.get_group_id() == 1) { @@ -356,7 +375,7 @@ EnableIfNativeBroadcast GroupBroadcast( } template EnableIfNativeBroadcast -GroupBroadcast(ext::oneapi::experimental::tangle_group g, T x, +GroupBroadcast(ext::oneapi::experimental::tangle g, T x, IdT local_id) { // Remap local_id to its original numbering in ParentGroup. auto LocalId = detail::IdToMaskPosition(g, local_id); @@ -370,25 +389,31 @@ GroupBroadcast(ext::oneapi::experimental::tangle_group g, T x, return __spirv_GroupNonUniformBroadcast(group_scope::value, WideOCLX, OCLId); } -template + +template EnableIfNativeBroadcast -GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x, +GroupBroadcast(ext::oneapi::experimental::chunk g, T x, IdT local_id) { - // Remap local_id to its original numbering in sub-group - auto LocalId = detail::IdToMaskPosition(g, local_id); + // Remap local_id to its original numbering in ParentGroup + auto LocalId = g.get_group_linear_id() * ChunkSize + local_id; // TODO: Refactor to avoid duplication after design settles. - auto GroupLocalId = - static_cast::type>(LocalId); + auto GroupLocalId = static_cast::type>(LocalId); auto OCLX = detail::convertToOpenCLType(x); WidenOpenCLTypeTo32_t WideOCLX = OCLX; auto OCLId = detail::convertToOpenCLType(GroupLocalId); - return __spirv_GroupNonUniformBroadcast( - group_scope::value, - WideOCLX, OCLId); + // NonUniformBroadcast requires Id to be dynamically uniform, which does not + // hold here; each partition is broadcasting a separate index. + // In case of falling back to either NonUniformShuffle or a + // NonUniformBroadcast per partition, and it's unclear which will be faster in + // practice. + return __spirv_GroupNonUniformShuffle(group_scope::value, + WideOCLX, OCLId); } +// opportunistic_group merged into fragment + template EnableIfBitcastBroadcast GroupBroadcast(Group g, T x, IdT local_id) { using BroadcastT = ConvertToNativeBroadcastType_t; @@ -431,8 +456,8 @@ EnableIfNativeBroadcast GroupBroadcast(Group g, T x, } template EnableIfNativeBroadcast -GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, - T x, id<1> local_id) { +GroupBroadcast(sycl::ext::oneapi::experimental::fragment g, T x, + id<1> local_id) { // Limited to 1D indices for now because ParentGroup must be sub-group. return GroupBroadcast(g, x, local_id[0]); } @@ -903,10 +928,11 @@ inline uint32_t membermask() { template inline uint32_t MapShuffleID(GroupT g, id<1> local_id) { - if constexpr (is_tangle_or_opportunistic_group::value || - is_ballot_group::value) + if constexpr (is_tangle_group::value || + is_ballot_group::value || is_fragment::value) return detail::IdToMaskPosition(g, local_id); - else if constexpr (is_fixed_size_group::value) + else if constexpr (is_fixed_size_group::value || + is_chunk::value) return g.get_group_linear_id() * g.get_local_range().size() + local_id; else return local_id.get(0); @@ -1276,7 +1302,7 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { template <__spv::GroupOperation Op, typename Group, typename T> \ inline typename std::enable_if_t< \ ext::oneapi::experimental::is_fixed_topology_group_v, T> \ - Group##Instruction(Group, T x) { \ + Group##Instruction(Group, T x) { \ using ConvertedT = detail::ConvertToOpenCLType_t; \ \ using OCLT = std::conditional_t< \ @@ -1294,7 +1320,7 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { \ template <__spv::GroupOperation Op, typename ParentGroup, typename T> \ inline T Group##Instruction( \ - ext::oneapi::experimental::ballot_group g, T x) { \ + ext::oneapi::experimental::fragment g, T x) { \ using ConvertedT = detail::ConvertToOpenCLType_t; \ \ using OCLT = std::conditional_t< \ @@ -1305,7 +1331,7 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { std::is_same(), \ opencl::cl_uint, ConvertedT>>; \ OCLT Arg = x; \ - /* ballot_group partitions its parent into two groups (0 and 1) */ \ + /* fragment partitions its parent into two groups (0 and 1) */ \ /* We have to force each group down different control flow */ \ /* Work-items in the "false" group (0) may still be active */ \ constexpr auto Scope = group_scope::value; \ @@ -1353,10 +1379,44 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { return tmp; \ } \ } \ + template <__spv::GroupOperation Op, size_t ChunkSize, typename ParentGroup, \ + typename T> \ + inline T Group##Instruction( \ + ext::oneapi::experimental::chunk g, T x) { \ + using ConvertedT = detail::ConvertToOpenCLType_t; \ + \ + using OCLT = std::conditional_t< \ + std::is_same() || \ + std::is_same(), \ + opencl::cl_int, \ + std::conditional_t() || \ + std::is_same(), \ + opencl::cl_uint, ConvertedT>>; \ + OCLT Arg = x; \ + constexpr auto Scope = group_scope::value; \ + \ + /* SPIR-V only defines a ClusteredReduce, with no equivalents for scan. */ \ + /* Emulate Clustered*Scan using control flow to separate clusters. */ \ + if constexpr (Op == __spv::GroupOperation::Reduce) { \ + constexpr auto OpInt = \ + static_cast(__spv::GroupOperation::ClusteredReduce); \ + return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg, \ + ChunkSize); \ + } else { \ + T tmp; \ + for (size_t Cluster = 0; Cluster < g.get_group_linear_range(); \ + ++Cluster) { \ + if (Cluster == g.get_group_linear_id()) { \ + constexpr auto OpInt = static_cast(Op); \ + tmp = __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \ + } \ + } \ + return tmp; \ + } \ + } \ template <__spv::GroupOperation Op, typename Group, typename T> \ - inline typename std::enable_if_t< \ - is_tangle_or_opportunistic_group::value, T> \ - Group##Instruction(Group, T x) { \ + inline typename std::enable_if_t::value, T> \ + Group##Instruction(Group, T x) { \ using ConvertedT = detail::ConvertToOpenCLType_t; \ \ using OCLT = std::conditional_t< \ diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp deleted file mode 100644 index 7a6f51cacc39e..0000000000000 --- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp +++ /dev/null @@ -1,182 +0,0 @@ -//==------ ballot_group.hpp --- SYCL extension for non-uniform groups ------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include -#include -#include // for is_group, is_user_cons... -#include -#include // for GetMask -#include // for sub_group_mask -#include // for id -#include // for memory_scope -#include // for range -#include // for sub_group - -#include // for enable_if_t, decay_t - -namespace sycl { -inline namespace _V1 { -namespace ext::oneapi::experimental { - -template class ballot_group; - -template -#ifdef __SYCL_DEVICE_ONLY__ -[[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_ballot_group)]] -#endif -inline std::enable_if_t> && - std::is_same_v, - ballot_group> get_ballot_group(Group group, - bool predicate); - -template class ballot_group { -public: - using id_type = id<1>; - using range_type = range<1>; - using linear_id_type = typename ParentGroup::linear_id_type; - static constexpr int dimensions = 1; - static constexpr sycl::memory_scope fence_scope = ParentGroup::fence_scope; - - id_type get_group_id() const { -#ifdef __SYCL_DEVICE_ONLY__ - return (Predicate) ? 1 : 0; -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - id_type get_local_id() const { -#ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::CallerPositionInMask(Mask); -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - range_type get_group_range() const { -#ifdef __SYCL_DEVICE_ONLY__ - return 2; -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - range_type get_local_range() const { -#ifdef __SYCL_DEVICE_ONLY__ - return Mask.count(); -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - linear_id_type get_group_linear_id() const { -#ifdef __SYCL_DEVICE_ONLY__ - return static_cast(get_group_id()[0]); -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - linear_id_type get_local_linear_id() const { -#ifdef __SYCL_DEVICE_ONLY__ - return static_cast(get_local_id()[0]); -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - linear_id_type get_group_linear_range() const { -#ifdef __SYCL_DEVICE_ONLY__ - return static_cast(get_group_range()[0]); -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - linear_id_type get_local_linear_range() const { -#ifdef __SYCL_DEVICE_ONLY__ - return static_cast(get_local_range()[0]); -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - bool leader() const { -#ifdef __SYCL_DEVICE_ONLY__ - uint32_t Lowest = static_cast(Mask.find_low()[0]); - return __spirv_SubgroupLocalInvocationId() == Lowest; -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - -protected: - const sub_group_mask Mask; - const bool Predicate; - - ballot_group(sub_group_mask m, bool p) : Mask(m), Predicate(p) {} - - friend ballot_group - get_ballot_group(ParentGroup g, bool predicate); - - friend sub_group_mask sycl::detail::GetMask>( - ballot_group Group); -}; - -template -inline std::enable_if_t> && - std::is_same_v, - ballot_group> -get_ballot_group(Group group, bool predicate) { - (void)group; -#ifdef __SYCL_DEVICE_ONLY__ -#if defined(__SPIR__) || defined(__SPIRV__) || defined(__NVPTX__) - // ballot_group partitions into two groups using the predicate - // Membership mask for one group is negation of the other - sub_group_mask mask = sycl::ext::oneapi::group_ballot(group, predicate); - if (predicate) { - return ballot_group(mask, predicate); - } else { - // To negate the mask for the false-predicate group, we also need to exclude - // all parts of the mask that is not part of the group. - sub_group_mask::BitsType participant_filter = - (~sub_group_mask::BitsType{0}) >> - (sub_group_mask::max_bits - group.get_local_linear_range()); - return ballot_group((~mask) & participant_filter, - predicate); - } -#endif -#else - (void)predicate; - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif -} - -template -struct is_user_constructed_group> : std::true_type {}; - -} // namespace ext::oneapi::experimental - -template -struct is_group> - : std::true_type {}; - -} // namespace _V1 -} // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp new file mode 100644 index 0000000000000..f94b7bec655e4 --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp @@ -0,0 +1,231 @@ +//==----------- chunk.hpp --- SYCL extension for non-uniform groups --------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi::experimental { + +template class chunk; +template class fragment; + +template +#ifdef __SYCL_DEVICE_ONLY__ +[[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_chunk)]] +#endif +inline std::enable_if_t> && + std::is_same_v, + chunk> +chunked_partition(ParentGroup parent); + +template class chunk { +public: + using id_type = id<1>; + using range_type = range<1>; + using linear_id_type = typename ParentGroup::linear_id_type; + static constexpr int dimensions = 1; + static constexpr sycl::memory_scope fence_scope = ParentGroup::fence_scope; + + inline operator fragment() const; + + id_type get_group_id() const { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_SubgroupLocalInvocationId() / ChunkSize; +#else + throw exception(make_error_code(errc::runtime), + "Non-uniform groups are not supported on host."); +#endif + } + + id_type get_local_id() const { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_SubgroupLocalInvocationId() % ChunkSize; +#else + throw exception(make_error_code(errc::runtime), + "Non-uniform groups are not supported on host."); +#endif + } + + range_type get_group_range() const { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_SubgroupSize() / ChunkSize; +#else + throw exception(make_error_code(errc::runtime), + "Non-uniform groups are not supported on host."); +#endif + } + + range_type get_local_range() const { +#ifdef __SYCL_DEVICE_ONLY__ + return ChunkSize; +#else + throw exception(make_error_code(errc::runtime), + "Non-uniform groups are not supported on host."); +#endif + } + + linear_id_type get_group_linear_id() const { +#ifdef __SYCL_DEVICE_ONLY__ + return static_cast(get_group_id()[0]); +#else + throw exception(make_error_code(errc::runtime), + "Non-uniform groups are not supported on host."); +#endif + } + + linear_id_type get_local_linear_id() const { +#ifdef __SYCL_DEVICE_ONLY__ + return static_cast(get_local_id()[0]); +#else + throw exception(make_error_code(errc::runtime), + "Non-uniform groups are not supported on host."); +#endif + } + + linear_id_type get_group_linear_range() const { +#ifdef __SYCL_DEVICE_ONLY__ + return static_cast(get_group_range()[0]); +#else + throw exception(make_error_code(errc::runtime), + "Non-uniform groups are not supported on host."); +#endif + } + + linear_id_type get_local_linear_range() const { +#ifdef __SYCL_DEVICE_ONLY__ + return static_cast(get_local_range()[0]); +#else + throw exception(make_error_code(errc::runtime), + "Non-uniform groups are not supported on host."); +#endif + } + + bool leader() const { +#ifdef __SYCL_DEVICE_ONLY__ + return get_local_linear_id() == 0; +#else + throw exception(make_error_code(errc::runtime), + "Non-uniform groups are not supported on host."); +#endif + } + +protected: +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + sub_group_mask Mask; +#endif + +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + chunk(ext::oneapi::sub_group_mask mask) : Mask(mask) {} +#else + chunk() {} +#endif + + friend chunk + chunked_partition(ParentGroup parent); + + friend sub_group_mask sycl::detail::GetMask>( + chunk Group); +}; + +// Chunked partition implementation +template +inline std::enable_if_t> && + std::is_same_v, + chunk> +chunked_partition(ParentGroup parent) { + (void)parent; +#ifdef __SYCL_DEVICE_ONLY__ + // sync all work-items in parent group before partitioning + sycl::group_barrier(parent); + +#if defined(__NVPTX__) + uint32_t loc_id = parent.get_local_linear_id(); + uint32_t loc_size = parent.get_local_linear_range(); + uint32_t bits = ChunkSize == 32 ? 0xffffffff + : ((1 << ChunkSize) - 1) + << ((loc_id / ChunkSize) * ChunkSize); + + return chunk( + sycl::detail::Builder::createSubGroupMask( + bits, loc_size)); +#else + return chunk(); +#endif +#else + throw exception(make_error_code(errc::runtime), + "Non-uniform groups are not supported on host."); +#endif +} + +// Type traits +template +struct is_user_constructed_group> + : std::true_type {}; + +template +struct is_chunk> : std::true_type {}; + +} // namespace ext::oneapi::experimental + +template +struct is_group> + : std::true_type {}; + +} // namespace _V1 +} // namespace sycl + +// chunk->fragment conversion +// must be defined after fragment class is available +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi::experimental { + +template +inline chunk::operator fragment() const { +#ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + // make fragment from chunk's mask and properties + return fragment(Mask, get_group_id(), get_group_range()); +#else + // or mask based on chunk membership for non-NVPTX devices + uint32_t loc_id = __spirv_SubgroupLocalInvocationId(); + uint32_t chunk_start = (loc_id / ChunkSize) * ChunkSize; + sub_group_mask::BitsType bits = + ChunkSize == 32 + ? sub_group_mask::BitsType(~0) + : ((sub_group_mask::BitsType(1) << ChunkSize) - 1) << chunk_start; + sub_group_mask mask = + sycl::detail::Builder::createSubGroupMask( + bits, __spirv_SubgroupSize()); + return fragment(mask, get_group_id(), get_group_range()); +#endif +#else + throw exception(make_error_code(errc::runtime), + "Non-uniform groups are not supported on host."); +#endif +} + +} // namespace ext::oneapi::experimental +} // namespace _V1 +} // namespace sycl \ No newline at end of file diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp deleted file mode 100644 index 56f30f2091a6b..0000000000000 --- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp +++ /dev/null @@ -1,194 +0,0 @@ -//==--- fixed_size_group.hpp --- SYCL extension for non-uniform groups -----==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include -#include -#include // for is_fixed_size_group, is_group -#include -#include -#include // for sub_group_mask -#include // for id -#include // for memory_scope -#include // for range -#include // for sub_group - -#include // for size_t -#include // for enable_if_t, true_type, dec... - -namespace sycl { -inline namespace _V1 { -namespace ext::oneapi::experimental { - -template class fixed_size_group; - -template -#ifdef __SYCL_DEVICE_ONLY__ -[[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_fixed_size_group)]] -#endif -inline std::enable_if_t> && - std::is_same_v, - fixed_size_group> -get_fixed_size_group(Group group); - -template class fixed_size_group { -public: - using id_type = id<1>; - using range_type = range<1>; - using linear_id_type = typename ParentGroup::linear_id_type; - static constexpr int dimensions = 1; - static constexpr sycl::memory_scope fence_scope = ParentGroup::fence_scope; - - id_type get_group_id() const { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupLocalInvocationId() / PartitionSize; -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - id_type get_local_id() const { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupLocalInvocationId() % PartitionSize; -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - range_type get_group_range() const { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupSize() / PartitionSize; -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - range_type get_local_range() const { -#ifdef __SYCL_DEVICE_ONLY__ - return PartitionSize; -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - linear_id_type get_group_linear_id() const { -#ifdef __SYCL_DEVICE_ONLY__ - return static_cast(get_group_id()[0]); -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - linear_id_type get_local_linear_id() const { -#ifdef __SYCL_DEVICE_ONLY__ - return static_cast(get_local_id()[0]); -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - linear_id_type get_group_linear_range() const { -#ifdef __SYCL_DEVICE_ONLY__ - return static_cast(get_group_range()[0]); -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - linear_id_type get_local_linear_range() const { -#ifdef __SYCL_DEVICE_ONLY__ - return static_cast(get_local_range()[0]); -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - - bool leader() const { -#ifdef __SYCL_DEVICE_ONLY__ - return get_local_linear_id() == 0; -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif - } - -protected: -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) - sub_group_mask Mask; -#endif - -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) - fixed_size_group(ext::oneapi::sub_group_mask mask) : Mask(mask) {} -#else - fixed_size_group() {} -#endif - - friend fixed_size_group - get_fixed_size_group(ParentGroup g); - - friend sub_group_mask - sycl::detail::GetMask>( - fixed_size_group Group); -}; - -template -inline std::enable_if_t> && - std::is_same_v, - fixed_size_group> -get_fixed_size_group(Group group) { - (void)group; -#ifdef __SYCL_DEVICE_ONLY__ -#if defined(__NVPTX__) - uint32_t loc_id = group.get_local_linear_id(); - uint32_t loc_size = group.get_local_linear_range(); - uint32_t bits = PartitionSize == 32 - ? 0xffffffff - : ((1 << PartitionSize) - 1) - << ((loc_id / PartitionSize) * PartitionSize); - - return fixed_size_group( - sycl::detail::Builder::createSubGroupMask( - bits, loc_size)); -#else - return fixed_size_group(); -#endif -#else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); -#endif -} - -template -struct is_user_constructed_group> - : std::true_type {}; - -} // namespace ext::oneapi::experimental - -namespace detail { -template -struct is_fixed_size_group< - ext::oneapi::experimental::fixed_size_group> - : std::true_type {}; -} // namespace detail - -template -struct is_group< - ext::oneapi::experimental::fixed_size_group> - : std::true_type {}; - -} // namespace _V1 -} // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp similarity index 53% rename from sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp rename to sycl/include/sycl/ext/oneapi/experimental/fragment.hpp index cbe8de03a9963..a89acdbabe7c2 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp @@ -1,4 +1,4 @@ -//==--- opportunistic_group.hpp --- SYCL extension for non-uniform groups --==// +//==---------- fragment.hpp --- SYCL extension for non-uniform groups ------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,49 +10,57 @@ #include #include -#include // for is_group, is_user_cons... +#include #include #include -#include // for this_sub_group -#include // for sub_group_mask -#include // for id -#include // for memory_scope -#include // for range +#include +#include +#include +#include +#include +#include #include #ifdef __SYCL_DEVICE_ONLY__ #include #endif -#include // for uint32_t -#include // for true_type +#include +#include namespace sycl { inline namespace _V1 { namespace ext::oneapi::experimental { -class opportunistic_group; +template class fragment; -namespace this_kernel { +template #ifdef __SYCL_DEVICE_ONLY__ -[[__sycl_detail__::__uses_aspects__( - sycl::aspect::ext_oneapi_opportunistic_group)]] +[[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_fragment)]] #endif -inline opportunistic_group get_opportunistic_group(); -} // namespace this_kernel +inline std::enable_if_t> && + std::is_same_v, + fragment> +binary_partition(ParentGroup parent, bool predicate); -class opportunistic_group { +namespace this_work_item { +#ifdef __SYCL_DEVICE_ONLY__ +[[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_fragment)]] +#endif +inline fragment get_opportunistic_group(); +} // namespace this_work_item + +template class fragment { public: using id_type = id<1>; using range_type = range<1>; - using linear_id_type = uint32_t; + using linear_id_type = typename ParentGroup::linear_id_type; static constexpr int dimensions = 1; - static constexpr sycl::memory_scope fence_scope = - sycl::memory_scope::sub_group; + static constexpr sycl::memory_scope fence_scope = ParentGroup::fence_scope; id_type get_group_id() const { #ifdef __SYCL_DEVICE_ONLY__ - return static_cast(0); + return GroupID; #else throw exception(make_error_code(errc::runtime), "Non-uniform groups are not supported on host."); @@ -70,7 +78,7 @@ class opportunistic_group { range_type get_group_range() const { #ifdef __SYCL_DEVICE_ONLY__ - return 1; + return GroupRange; #else throw exception(make_error_code(errc::runtime), "Non-uniform groups are not supported on host."); @@ -134,31 +142,67 @@ class opportunistic_group { protected: sub_group_mask Mask; + id_type GroupID; + range_type GroupRange; + + fragment(sub_group_mask m, id_type group_id, range_type group_range) + : Mask(m), GroupID(group_id), GroupRange(group_range) {} - opportunistic_group(sub_group_mask m) : Mask(m) {} + friend fragment binary_partition(ParentGroup parent, + bool predicate); - friend opportunistic_group this_kernel::get_opportunistic_group(); + friend fragment this_work_item::get_opportunistic_group(); friend sub_group_mask - sycl::detail::GetMask(opportunistic_group Group); + sycl::detail::GetMask>(fragment Group); }; -namespace this_kernel { +template +inline std::enable_if_t> && + std::is_same_v, + fragment> +binary_partition(ParentGroup parent, bool predicate) { + (void)parent; +#ifdef __SYCL_DEVICE_ONLY__ + // sync all work-items in parent group before partitioning + sycl::group_barrier(parent); + +#if defined(__SPIR__) || defined(__SPIRV__) || defined(__NVPTX__) + sub_group_mask mask = sycl::ext::oneapi::group_ballot(parent, predicate); + id<1> group_id = predicate ? 1 : 0; + range<1> group_range = 2; // 2 groupds based on predicate by binary_partition + + if (!predicate) { + sub_group_mask::BitsType participant_filter = + (~sub_group_mask::BitsType{0}) >> + (sub_group_mask::max_bits - parent.get_local_linear_range()); + mask = (~mask) & participant_filter; + } -inline opportunistic_group get_opportunistic_group() { + return fragment(mask, group_id, group_range); +#endif +#else + (void)predicate; + throw exception(make_error_code(errc::runtime), + "Non-uniform groups are not supported on host."); +#endif +} + +namespace this_work_item { + +inline fragment get_opportunistic_group() { #ifdef __SYCL_DEVICE_ONLY__ #if defined(__SPIR__) || defined(__SPIRV__) - // TODO: It may be wiser to call the intrinsic than rely on this_group() sycl::sub_group sg = sycl::ext::oneapi::experimental::this_sub_group(); sub_group_mask mask = sycl::ext::oneapi::group_ballot(sg, true); - return opportunistic_group(mask); + return fragment(mask, 0, 1); #elif defined(__NVPTX__) uint32_t active_mask; asm volatile("activemask.b32 %0;" : "=r"(active_mask)); sub_group_mask mask = sycl::detail::Builder::createSubGroupMask( active_mask, 32); - return opportunistic_group(mask); + return fragment(mask, 0, 1); #endif #else throw exception(make_error_code(errc::runtime), @@ -166,16 +210,19 @@ inline opportunistic_group get_opportunistic_group() { #endif } -} // namespace this_kernel +} // namespace this_work_item + +template +struct is_user_constructed_group> : std::true_type {}; -template <> -struct is_user_constructed_group : std::true_type {}; +template +struct is_fragment> : std::true_type {}; } // namespace ext::oneapi::experimental -template <> -struct is_group +template +struct is_group> : std::true_type {}; } // namespace _V1 -} // namespace sycl +} // namespace sycl \ No newline at end of file diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index bbe619834dcdc..98aa0dab2a071 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -12,8 +12,9 @@ #include // for marray #include // for vec -#include // for size_t -#include // for uint32_t +#include // for size_t +#include // for uint32_t +#include // for false_type namespace sycl { inline namespace _V1 { @@ -72,10 +73,23 @@ inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) { namespace ext::oneapi::experimental { // Forward declarations of non-uniform group types for algorithm definitions -template class ballot_group; -template class fixed_size_group; -template class tangle_group; -class opportunistic_group; +template class fragment; +template class chunk; +template class tangle; + +// Type trait helpers +template struct is_chunk : std::false_type {}; + +template inline constexpr bool is_chunk_v = is_chunk::value; + +template struct is_fragment : std::false_type {}; + +template +inline constexpr bool is_fragment_v = is_fragment::value; + +template struct is_tangle : std::false_type {}; + +template inline constexpr bool is_tangle_v = is_tangle::value; } // namespace ext::oneapi::experimental diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp similarity index 64% rename from sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp rename to sycl/include/sycl/ext/oneapi/experimental/tangle.hpp index 59af6bdfc753b..726fba7b8bde9 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp @@ -1,4 +1,5 @@ -//==------ tangle_group.hpp --- SYCL extension for non-uniform groups ------==// +//==------------- tangle.hpp --- SYCL extension for non-uniform groups +//------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,32 +11,31 @@ #include #include -#include // for is_group, is_user_cons... +#include #include #include -#include // for sub_group_mask -#include // for id -#include // for memory_scope -#include // for range -#include // for sub_group - +#include +#include +#include // for id +#include // for memory_scope +#include +#include #include // for enable_if_t, decay_t namespace sycl { inline namespace _V1 { namespace ext::oneapi::experimental { -template class tangle_group; - -template +template class tangle; +template #ifdef __SYCL_DEVICE_ONLY__ -[[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_tangle_group)]] +[[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_tangle)]] #endif -inline std::enable_if_t> && - std::is_same_v, - tangle_group> get_tangle_group(Group group); +inline std::enable_if_t> && + std::is_same_v, + tangle> entangle(ParentGroup parent); -template class tangle_group { +template class tangle { public: using id_type = id<1>; using range_type = range<1>; @@ -126,50 +126,57 @@ template class tangle_group { } protected: +#ifdef __SYCL_DEVICE_ONLY__ sub_group_mask Mask; +#endif - tangle_group(sub_group_mask m) : Mask(m) {} +#ifdef __SYCL_DEVICE_ONLY__ + tangle(ext::oneapi::sub_group_mask mask) : Mask(mask) {} +#else + tangle() {} +#endif - friend tangle_group get_tangle_group(ParentGroup); + friend tangle entangle(ParentGroup); - friend sub_group_mask sycl::detail::GetMask>( - tangle_group Group); + friend sub_group_mask + sycl::detail::GetMask>(tangle Group); }; -template -inline std::enable_if_t> && - std::is_same_v, - tangle_group> -get_tangle_group(Group group) { - (void)group; +template +inline std::enable_if_t> && + std::is_same_v, + tangle> +entangle(ParentGroup parent) { + (void)parent; #ifdef __SYCL_DEVICE_ONLY__ + // sync all work-items in parent group here + sycl::group_barrier(parent); + #if defined(__SPIR__) || defined(__SPIRV__) - // All SPIR-V devices that we currently target execute in SIMD fashion, - // and so the group of work-items in converged control flow is implicit. - // We store the mask here because it is required to calculate IDs, not - // because it is required to construct the group. - sub_group_mask mask = sycl::ext::oneapi::group_ballot(group, true); - return tangle_group(mask); + // mask is required to calculate IDs (not the group). + sub_group_mask mask = sycl::ext::oneapi::group_ballot(parent, true); + return tangle(mask); #elif defined(__NVPTX__) - // TODO: Construct from compiler-generated mask. Return an invalid group in - // in the meantime. CUDA devices will report false for the tangle_group - // support aspect so kernels launch should ensure this is never run. - return tangle_group(0); + // TODO: CUDA devices will report false for the tangle + // support aspect so kernels launch should ensure this is never run. + return tangle(0); #endif #else throw exception(make_error_code(errc::runtime), "Non-uniform groups are not supported on host."); #endif +} -} // namespace this_kernel +template +struct is_user_constructed_group> : std::true_type {}; template -struct is_user_constructed_group> : std::true_type {}; +struct is_tangle> : std::true_type {}; } // namespace ext::oneapi::experimental template -struct is_group> +struct is_group> : std::true_type {}; } // namespace _V1 diff --git a/sycl/include/sycl/info/aspects.def b/sycl/include/sycl/info/aspects.def index 457ea5628ed60..246bd7766e0d4 100644 --- a/sycl/include/sycl/info/aspects.def +++ b/sycl/include/sycl/info/aspects.def @@ -45,37 +45,36 @@ __SYCL_ASPECT(ext_oneapi_mipmap, 50) __SYCL_ASPECT(ext_oneapi_mipmap_anisotropy, 51) __SYCL_ASPECT(ext_oneapi_mipmap_level_reference, 52) __SYCL_ASPECT(ext_intel_esimd, 53) -__SYCL_ASPECT(ext_oneapi_ballot_group, 54) -__SYCL_ASPECT(ext_oneapi_fixed_size_group, 55) -__SYCL_ASPECT(ext_oneapi_opportunistic_group, 56) -__SYCL_ASPECT(ext_oneapi_tangle_group, 57) -__SYCL_ASPECT(ext_intel_matrix, 58) -__SYCL_ASPECT(ext_oneapi_is_composite, 59) -__SYCL_ASPECT(ext_oneapi_is_component, 60) -__SYCL_ASPECT(ext_oneapi_graph, 61) -__SYCL_ASPECT(ext_intel_fpga_task_sequence, 62) -__SYCL_ASPECT(ext_oneapi_limited_graph, 63) -__SYCL_ASPECT(ext_oneapi_private_alloca, 64) -__SYCL_ASPECT(ext_oneapi_cubemap, 65) -__SYCL_ASPECT(ext_oneapi_cubemap_seamless_filtering, 66) -__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_1d_usm, 67) -__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_1d, 68) -__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_2d_usm, 69) -__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_2d, 70) -__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_3d, 72) -__SYCL_ASPECT(ext_oneapi_queue_profiling_tag, 73) -__SYCL_ASPECT(ext_oneapi_virtual_mem, 74) -__SYCL_ASPECT(ext_oneapi_cuda_cluster_group, 75) -__SYCL_ASPECT(ext_oneapi_image_array, 76) -__SYCL_ASPECT(ext_oneapi_unique_addressing_per_dim, 77) -__SYCL_ASPECT(ext_oneapi_bindless_images_sample_1d_usm, 78) -__SYCL_ASPECT(ext_oneapi_bindless_images_sample_2d_usm, 79) -__SYCL_ASPECT(ext_oneapi_atomic16, 80) -__SYCL_ASPECT(ext_oneapi_virtual_functions, 81) -__SYCL_ASPECT(ext_intel_spill_memory_size, 82) -__SYCL_ASPECT(ext_oneapi_bindless_images_gather, 83) -__SYCL_ASPECT(ext_intel_current_clock_throttle_reasons, 84) -__SYCL_ASPECT(ext_intel_fan_speed, 85) -__SYCL_ASPECT(ext_intel_power_limits, 86) -__SYCL_ASPECT(ext_oneapi_async_memory_alloc, 87) +__SYCL_ASPECT(ext_oneapi_fragment, 54) +__SYCL_ASPECT(ext_oneapi_chunk, 55) +__SYCL_ASPECT(ext_oneapi_tangle, 56) +__SYCL_ASPECT(ext_intel_matrix, 57) +__SYCL_ASPECT(ext_oneapi_is_composite, 58) +__SYCL_ASPECT(ext_oneapi_is_component, 59) +__SYCL_ASPECT(ext_oneapi_graph, 60) +__SYCL_ASPECT(ext_intel_fpga_task_sequence, 61) +__SYCL_ASPECT(ext_oneapi_limited_graph, 62) +__SYCL_ASPECT(ext_oneapi_private_alloca, 63) +__SYCL_ASPECT(ext_oneapi_cubemap, 64) +__SYCL_ASPECT(ext_oneapi_cubemap_seamless_filtering, 65) +__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_1d_usm, 66) +__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_1d, 67) +__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_2d_usm, 68) +__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_2d, 69) +__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_3d, 70) +__SYCL_ASPECT(ext_oneapi_queue_profiling_tag, 71) +__SYCL_ASPECT(ext_oneapi_virtual_mem, 72) +__SYCL_ASPECT(ext_oneapi_cuda_cluster_group, 73) +__SYCL_ASPECT(ext_oneapi_image_array, 74) +__SYCL_ASPECT(ext_oneapi_unique_addressing_per_dim, 75) +__SYCL_ASPECT(ext_oneapi_bindless_images_sample_1d_usm, 76) +__SYCL_ASPECT(ext_oneapi_bindless_images_sample_2d_usm, 77) +__SYCL_ASPECT(ext_oneapi_atomic16, 78) +__SYCL_ASPECT(ext_oneapi_virtual_functions, 79) +__SYCL_ASPECT(ext_intel_spill_memory_size, 80) +__SYCL_ASPECT(ext_oneapi_bindless_images_gather, 81) +__SYCL_ASPECT(ext_intel_current_clock_throttle_reasons, 82) +__SYCL_ASPECT(ext_intel_fan_speed, 83) +__SYCL_ASPECT(ext_intel_power_limits, 84) +__SYCL_ASPECT(ext_oneapi_async_memory_alloc, 85) diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp index 8977a3166b163..01ae4295eaef0 100644 --- a/sycl/include/sycl/sycl.hpp +++ b/sycl/include/sycl/sycl.hpp @@ -86,27 +86,30 @@ #include #include #include -#include #include #include +#include #include #include #include +#include #include +#include #include -#include +#include #include +#include #include +#include #include #include -#include #include #include #include #include #include #include -#include +#include #include #include #include diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index a7729d19ce3e2..e9988510704df 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -109,7 +109,8 @@ device_impl::get_backend_info() const { bool device_impl::has_extension(const std::string &ExtensionName) const { std::string AllExtensionNames = get_info_impl(); - return (AllExtensionNames.find(ExtensionName) != std::string::npos); + return ((" " + AllExtensionNames + " ").find(" " + ExtensionName + " ") != + std::string::npos); } bool device_impl::is_partition_supported(info::partition_property Prop) const { diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 391bce575d8a7..fa22c8e5bf4ff 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -1433,26 +1433,30 @@ class device_impl : public std::enable_shared_from_this { CASE(ext_intel_esimd) { return get_info_impl_nocheck().value_or(0); } - CASE(ext_oneapi_ballot_group) { + CASE(ext_oneapi_fragment) { + // check actual OpenCL extensions: + if (this->getBackend() == backend::opencl) + return has_extension("cl_khr_subgroup_non_uniform_vote") || + has_extension("cl_khr_subgroup_ballot") || + has_extension("cl_intel_spirv_subgroups"); + // TODO: add corrcet checks for other backends return (this->getBackend() == backend::ext_oneapi_level_zero) || - (this->getBackend() == backend::opencl) || (this->getBackend() == backend::ext_oneapi_cuda); } - CASE(ext_oneapi_fixed_size_group) { + CASE(ext_oneapi_chunk) { + // Check for Intel subgroups extension that provides block read/write + if (this->getBackend() == backend::opencl) + return has_extension("cl_intel_subgroups"); + // TODO: add corrcet checks for other backends return (this->getBackend() == backend::ext_oneapi_level_zero) || - (this->getBackend() == backend::opencl) || (this->getBackend() == backend::ext_oneapi_cuda); } - CASE(ext_oneapi_opportunistic_group) { - return (this->getBackend() == backend::ext_oneapi_level_zero) || - (this->getBackend() == backend::opencl) || - (this->getBackend() == backend::ext_oneapi_cuda); - } - CASE(ext_oneapi_tangle_group) { - // TODO: tangle_group is not currently supported for CUDA devices. Add - // when implemented. - return (this->getBackend() == backend::ext_oneapi_level_zero) || - (this->getBackend() == backend::opencl); + CASE(ext_oneapi_tangle) { + // note: typically is available on newer GPUs - not on CPUs + if (this->getBackend() == backend::opencl) + return has_extension("cl_intel_subgroup_matrix_multiply_accumulate"); + // TODO: add proper checks for other backends + return (this->getBackend() == backend::ext_oneapi_level_zero); } CASE(ext_intel_matrix) { using arch = sycl::ext::oneapi::experimental::architecture; diff --git a/sycl/test-e2e/Assert/check_resource_leak.cpp b/sycl/test-e2e/Assert/check_resource_leak.cpp index 48c1b0eb54cf7..791b8c6d39c61 100644 --- a/sycl/test-e2e/Assert/check_resource_leak.cpp +++ b/sycl/test-e2e/Assert/check_resource_leak.cpp @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp b/sycl/test-e2e/NonUniformGroups/chunk.cpp similarity index 75% rename from sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp rename to sycl/test-e2e/NonUniformGroups/chunk.cpp index 199f6332f9783..25bef36cbfcdc 100644 --- a/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk.cpp @@ -11,7 +11,7 @@ // REQUIRES: sg-32 #include -#include +#include #include namespace syclex = sycl::ext::oneapi::experimental; @@ -20,6 +20,12 @@ template class TestKernel; template void test() { sycl::queue Q; + if (!Q.get_device().has(sycl::aspect::ext_oneapi_chunk)) { + std::cout << "Device has no support for ext_oneapi_chunk aspect..." + << std::endl; + return; + } + // Test for both the full sub-group size and a case with less work than a full // sub-group. for (size_t WGS : std::array{32, 16}) { @@ -42,15 +48,15 @@ template void test() { auto SG = item.get_sub_group(); auto SGS = SG.get_local_linear_range(); - auto Partition = syclex::get_fixed_size_group(SG); + auto ChunkGroup = syclex::chunked_partition(SG); bool Match = true; - Match &= (Partition.get_group_id() == (WI / PartitionSize)); - Match &= (Partition.get_local_id() == (WI % PartitionSize)); - Match &= (Partition.get_group_range() == (SGS / PartitionSize)); - Match &= (Partition.get_local_range() == PartitionSize); + Match &= (ChunkGroup.get_group_id() == (WI / PartitionSize)); + Match &= (ChunkGroup.get_local_id() == (WI % PartitionSize)); + Match &= (ChunkGroup.get_group_range() == (SGS / PartitionSize)); + Match &= (ChunkGroup.get_local_range() == PartitionSize); MatchAcc[WI] = Match; - LeaderAcc[WI] = Partition.leader(); + LeaderAcc[WI] = ChunkGroup.leader(); }; CGH.parallel_for>(NDR, KernelFunc); }); diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp similarity index 85% rename from sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp rename to sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp index 5bd4bf09f7dce..cc24fab7be4bd 100644 --- a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp @@ -8,7 +8,7 @@ // // REQUIRES: cpu || gpu // REQUIRES: sg-32 -// REQUIRES: aspect-ext_oneapi_fixed_size_group +// REQUIRES: aspect-ext_oneapi_chunk // UNSUPPORTED: target-amd // UNSUPPORTED-INTENDED: fixed_size_group aspect not available on amd @@ -17,7 +17,7 @@ // The test is disabled for spirv-backend while we investigate the root cause. #include -#include +#include #include #include #include @@ -65,11 +65,11 @@ template void test() { auto SG = item.get_sub_group(); // Split into partitions of fixed size - auto Partition = syclex::get_fixed_size_group(SG); + auto ChunkGroup = syclex::chunked_partition(SG); // Check all other members' writes are visible after a barrier. TmpAcc[WI] = 1; - sycl::group_barrier(Partition); + sycl::group_barrier(ChunkGroup); size_t Visible = 0; for (size_t Other = 0; Other < SGSize; ++Other) { if ((WI / PartitionSize) == (Other / PartitionSize)) { @@ -80,26 +80,26 @@ template void test() { // Simple check of group algorithms. uint32_t OriginalLID = SG.get_local_linear_id(); - uint32_t LID = Partition.get_local_linear_id(); + uint32_t LID = ChunkGroup.get_local_linear_id(); uint32_t PartitionLeader = (OriginalLID / PartitionSize) * PartitionSize; uint32_t BroadcastResult = - sycl::group_broadcast(Partition, OriginalLID, 0); + sycl::group_broadcast(ChunkGroup, OriginalLID, 0); BroadcastAcc[WI] = (BroadcastResult == PartitionLeader); - bool AnyResult = sycl::any_of_group(Partition, (LID == 0)); + bool AnyResult = sycl::any_of_group(ChunkGroup, (LID == 0)); AnyAcc[WI] = (AnyResult == true); bool Predicate = ((OriginalLID / PartitionSize) % 2 == 0); - bool AllResult = sycl::all_of_group(Partition, Predicate); + bool AllResult = sycl::all_of_group(ChunkGroup, Predicate); if (Predicate) { AllAcc[WI] = (AllResult == true); } else { AllAcc[WI] = (AllResult == false); } - bool NoneResult = sycl::none_of_group(Partition, Predicate); + bool NoneResult = sycl::none_of_group(ChunkGroup, Predicate); if (Predicate) { NoneAcc[WI] = (NoneResult == false); } else { @@ -107,34 +107,34 @@ template void test() { } uint32_t ReduceResult = - sycl::reduce_over_group(Partition, 1, sycl::plus<>()); + sycl::reduce_over_group(ChunkGroup, 1, sycl::plus<>()); ReduceAcc[WI] = (ReduceResult == PartitionSize); uint32_t ExScanResult = - sycl::exclusive_scan_over_group(Partition, 1, sycl::plus<>()); + sycl::exclusive_scan_over_group(ChunkGroup, 1, sycl::plus<>()); ExScanAcc[WI] = (ExScanResult == LID); uint32_t IncScanResult = - sycl::inclusive_scan_over_group(Partition, 1, sycl::plus<>()); + sycl::inclusive_scan_over_group(ChunkGroup, 1, sycl::plus<>()); IncScanAcc[WI] = (IncScanResult == LID + 1); - uint32_t ShiftLeftResult = sycl::shift_group_left(Partition, LID, 2); + uint32_t ShiftLeftResult = sycl::shift_group_left(ChunkGroup, LID, 2); ShiftLeftAcc[WI] = (LID + 2 >= PartitionSize || ShiftLeftResult == LID + 2); uint32_t ShiftRightResult = - sycl::shift_group_right(Partition, LID, 2); + sycl::shift_group_right(ChunkGroup, LID, 2); ShiftRightAcc[WI] = (LID < 2 || ShiftRightResult == LID - 2); uint32_t SelectResult = sycl::select_from_group( - Partition, OriginalLID, - (Partition.get_local_id() + 2) % PartitionSize); + ChunkGroup, OriginalLID, + (ChunkGroup.get_local_id() + 2) % PartitionSize); SelectAcc[WI] = SelectResult == OriginalLID - LID + ((LID + 2) % PartitionSize); uint32_t Mask = PartitionSize <= 2 ? 0 : 2; uint32_t PermuteXorResult = - sycl::permute_group_by_xor(Partition, LID, Mask); + sycl::permute_group_by_xor(ChunkGroup, LID, Mask); PermuteXorAcc[WI] = (PermuteXorResult == (LID ^ Mask)); }; CGH.parallel_for>(NDR, KernelFunc); diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp b/sycl/test-e2e/NonUniformGroups/fragment.cpp similarity index 79% rename from sycl/test-e2e/NonUniformGroups/ballot_group.cpp rename to sycl/test-e2e/NonUniformGroups/fragment.cpp index e7cc44b0c1d8e..2df5abf5d834f 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/fragment.cpp @@ -10,7 +10,7 @@ // UNSUPPORTED: hip #include -#include +#include #include namespace syclex = sycl::ext::oneapi::experimental; @@ -19,6 +19,12 @@ class TestKernel; int main() { sycl::queue Q; + if (!Q.get_device().has(sycl::aspect::ext_oneapi_fragment)) { + std::cout << "Device has no support for ext_oneapi_fragment aspect..." + << std::endl; + return 0; + } + auto SGSizes = Q.get_device().get_info(); if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { std::cout << "Test skipped due to missing support for sub-group size 32." @@ -45,7 +51,7 @@ int main() { // Split into odd and even work-items. bool Predicate = WI % 2 == 0; - auto BallotGroup = syclex::get_ballot_group(SG, Predicate); + auto FragmentGroup = syclex::binary_partition(SG, Predicate); // Check function return values match Predicate. // NB: Test currently uses exactly one sub-group, but we use SG @@ -53,13 +59,13 @@ int main() { bool Match = true; auto GroupID = (Predicate) ? 1 : 0; auto LocalID = SG.get_local_id() / 2; - Match &= (BallotGroup.get_group_id() == GroupID); - Match &= (BallotGroup.get_local_id() == LocalID); - Match &= (BallotGroup.get_group_range() == 2); - Match &= (BallotGroup.get_local_range() == + Match &= (FragmentGroup.get_group_id() == GroupID); + Match &= (FragmentGroup.get_local_id() == LocalID); + Match &= (FragmentGroup.get_group_range() == 2); + Match &= (FragmentGroup.get_local_range() == SG.get_local_linear_range() / 2); MatchAcc[WI] = Match; - LeaderAcc[WI] = BallotGroup.leader(); + LeaderAcc[WI] = FragmentGroup.leader(); }; CGH.parallel_for(NDR, KernelFunc); }); diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/fragment_algorithms.cpp similarity index 97% rename from sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp rename to sycl/test-e2e/NonUniformGroups/fragment_algorithms.cpp index 55040474f35c1..f3cbb03d7acae 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/fragment_algorithms.cpp @@ -8,7 +8,7 @@ // // REQUIRES: cpu || gpu // REQUIRES: sg-32 -// REQUIRES: aspect-ext_oneapi_ballot_group +// REQUIRES: aspect-ext_oneapi_fragment // Fails in Nightly testing on the self-hosted CUDA runner: // UNSUPPORTED: cuda @@ -19,7 +19,7 @@ // The test is disabled for spirv-backend while we investigate the root cause. #include -#include +#include #include #include #include @@ -66,7 +66,7 @@ int main() { // Split into odd and even work-items. bool Predicate = WI % 2 == 0; - auto BallotGroup = syclex::get_ballot_group(SG, Predicate); + auto BallotGroup = syclex::binary_partition(SG, Predicate); uint32_t BallotGroupSize = BallotGroup.get_local_linear_range(); // Check all other members' writes are visible after a barrier. diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic.cpp similarity index 73% rename from sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp rename to sycl/test-e2e/NonUniformGroups/opportunistic.cpp index f371022b2df7d..ed38b89ad2312 100644 --- a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/opportunistic.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out // -// CPU AOT targets host isa, so we compile on the run system instead. +// CPU AOT targets host isa - compile on run system instead // REQUIRES: opencl-aot // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} // RUN: %if cpu %{ %{run} %t.x86.out %} @@ -10,7 +10,7 @@ // UNSUPPORTED: hip #include -#include +#include #include namespace syclex = sycl::ext::oneapi::experimental; @@ -19,6 +19,12 @@ class TestKernel; int main() { sycl::queue Q; + if (!Q.get_device().has(sycl::aspect::ext_oneapi_fragment)) { + std::cout << "Device has no support for ext_oneapi_fragment aspect..." + << std::endl; + return 0; + } + auto SGSizes = Q.get_device().get_info(); if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { std::cout << "Test skipped due to missing support for sub-group size 32." @@ -43,12 +49,17 @@ int main() { auto WI = item.get_global_id(); auto SG = item.get_sub_group(); - // Due to the unpredictable runtime behavior of opportunistic - // groups, some values may change from run to run. Check they're in - // expected ranges and consistent with other groups. + // opportunistic has unpredictable runtime behavior due to: + // - dynamic formation from work-items + // - hardware scheduling + // - runtime factors that affect grouping: + // - platform differences (SPIR-V vs. NVIDIA - group_ballot vs + // activemask) groups + // so need to check some values may change from run to run + // for expected ranges and consistent with other groups. if (item.get_global_id() % 2 == 0) { auto OpportunisticGroup = - syclex::this_kernel::get_opportunistic_group(); + syclex::this_work_item::get_opportunistic_group(); bool Match = true; Match &= (OpportunisticGroup.get_group_id() == 0); @@ -67,12 +78,11 @@ int main() { sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; uint32_t NumLeaders = 0; - for (int WI = 0; WI < WGS; ++WI) { + for (size_t WI{}; WI != WGS; ++WI) { if (WI % 2 == 0) { assert(MatchAcc[WI] == true); - if (LeaderAcc[WI]) { + if (LeaderAcc[WI]) NumLeaders++; - } } } assert(NumLeaders > 0); diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_algorithms.cpp similarity index 97% rename from sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp rename to sycl/test-e2e/NonUniformGroups/opportunistic_algorithms.cpp index 6b89c7f30447d..5cc65cde17797 100644 --- a/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/opportunistic_algorithms.cpp @@ -8,10 +8,10 @@ // // REQUIRES: cpu || gpu // REQUIRES: sg-32 -// REQUIRES: aspect-ext_oneapi_opportunistic_group +// REQUIRES: aspect-ext_oneapi_fragment #include -#include +#include #include #include #include @@ -68,7 +68,7 @@ int main() { // arbitrary group membership. if (OriginalLID == ArbitraryItem) { auto OpportunisticGroup = - syclex::this_kernel::get_opportunistic_group(); + syclex::this_work_item::get_opportunistic_group(); // This is trivial, but does test that group_barrier can be called. TmpAcc[WI] = 1; diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp b/sycl/test-e2e/NonUniformGroups/tangle.cpp similarity index 90% rename from sycl/test-e2e/NonUniformGroups/tangle_group.cpp rename to sycl/test-e2e/NonUniformGroups/tangle.cpp index 920f409c5eacf..a6ead14344ba9 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle.cpp @@ -10,7 +10,7 @@ // UNSUPPORTED: target-nvidia || target-amd #include -#include +#include #include namespace syclex = sycl::ext::oneapi::experimental; @@ -19,6 +19,12 @@ class TestKernel; int main() { sycl::queue Q; + if (!Q.get_device().has(sycl::aspect::ext_oneapi_tangle)) { + std::cout << "Device has no support for ext_oneapi_tangle aspect..." + << std::endl; + return 0; + } + auto SGSizes = Q.get_device().get_info(); if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { std::cout << "Test skipped due to missing support for sub-group size 32." @@ -47,7 +53,7 @@ int main() { // Branches deliberately duplicated to test impact of optimizations. // This only reliably works with optimizations disabled right now. if (item.get_global_id() % 2 == 0) { - auto TangleGroup = syclex::get_tangle_group(SG); + auto TangleGroup = syclex::entangle(SG); bool Match = true; Match &= (TangleGroup.get_group_id() == 0); @@ -58,7 +64,7 @@ int main() { MatchAcc[WI] = Match; LeaderAcc[WI] = TangleGroup.leader(); } else { - auto TangleGroup = syclex::get_tangle_group(SG); + auto TangleGroup = syclex::entangle(SG); bool Match = true; Match &= (TangleGroup.get_group_id() == 0); diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp similarity index 96% rename from sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp rename to sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp index f160e64c09125..117c45ba34282 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp @@ -8,7 +8,7 @@ // // REQUIRES: cpu || gpu // REQUIRES: sg-32 -// REQUIRES: aspect-ext_oneapi_tangle_group +// REQUIRES: aspect-ext_oneapi_tangle // UNSUPPORTED: target-amd // UNSUPPORTED-INTENDED: tangle groups not available on amd // UNSUPPORTED: target-nvidia || windows @@ -16,7 +16,7 @@ // The test is disabled while we investigate the root cause. #include -#include +#include #include #include #include @@ -124,13 +124,13 @@ int main() { // Split into three groups of different sizes, using control flow // Body of each branch is deliberately duplicated if (WI < 4) { - auto Tangle = syclex::get_tangle_group(SG); + auto Tangle = syclex::entangle(SG); size_t TangleLeader = 0; size_t TangleSize = 4; auto IsMember = [](size_t Other) { return (Other < 4); }; BranchBody(WI, Tangle, TangleLeader, TangleSize, IsMember); } else if (WI < 24) { - auto Tangle = syclex::get_tangle_group(SG); + auto Tangle = syclex::entangle(SG); size_t TangleLeader = 4; size_t TangleSize = 20; auto IsMember = [](size_t Other) { @@ -138,7 +138,7 @@ int main() { }; BranchBody(WI, Tangle, TangleLeader, TangleSize, IsMember); } else /* if WI < 32) */ { - auto Tangle = syclex::get_tangle_group(SG); + auto Tangle = syclex::entangle(SG); size_t TangleLeader = 24; size_t TangleSize = 8; auto IsMember = [](size_t Other) { diff --git a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp index 878c26c8f26d5..6c9a65b818677 100644 --- a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp +++ b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp @@ -208,11 +208,11 @@ // CHECK-NEXT: Matrix/joint_matrix_bf16_fill_k_cache_prefetch.cpp // CHECK-NEXT: Matrix/joint_matrix_down_convert.cpp // CHECK-NEXT: Matrix/joint_matrix_rowmajorA_rowmajorB.cpp -// CHECK-NEXT: NonUniformGroups/ballot_group.cpp -// CHECK-NEXT: NonUniformGroups/fixed_size_group.cpp -// CHECK-NEXT: NonUniformGroups/opportunistic_group.cpp -// CHECK-NEXT: NonUniformGroups/tangle_group.cpp -// CHECK-NEXT: NonUniformGroups/tangle_group_algorithms.cpp +// CHECK-NEXT: NonUniformGroups/chunk.cpp +// CHECK-NEXT: NonUniformGroups/fragment.cpp +// CHECK-NEXT: NonUniformGroups/opportunistic.cpp +// CHECK-NEXT: NonUniformGroups/tangle.cpp +// CHECK-NEXT: NonUniformGroups/tangle_algorithms.cpp // CHECK-NEXT: OptionalKernelFeatures/large-reqd-work-group-size.cpp // CHECK-NEXT: OptionalKernelFeatures/no-fp64-optimization-declared-aspects.cpp // CHECK-NEXT: Printf/char.cpp diff --git a/sycl/test/non-uniform-groups/is_user_constructed.cpp b/sycl/test/non-uniform-groups/is_user_constructed.cpp index d5b7df40662fb..a694c4ba49eca 100644 --- a/sycl/test/non-uniform-groups/is_user_constructed.cpp +++ b/sycl/test/non-uniform-groups/is_user_constructed.cpp @@ -1,17 +1,20 @@ // RUN: %clangxx -fsycl -fsyntax-only %s -#include -#include -#include -#include +#include +#include +#include namespace syclex = sycl::ext::oneapi::experimental; -static_assert( - syclex::is_user_constructed_group_v>); -static_assert(syclex::is_user_constructed_group_v< - syclex::fixed_size_group<1, sycl::sub_group>>); -static_assert(syclex::is_user_constructed_group_v< - syclex::fixed_size_group<2, sycl::sub_group>>); -static_assert( - syclex::is_user_constructed_group_v>); -static_assert(syclex::is_user_constructed_group_v); +template +inline constexpr bool is_user_constructed_group = syclex::is_user_constructed_group_v; + +// is recognized as user-constructed +static_assert(is_user_constructed_group>); +static_assert(is_user_constructed_group>); +static_assert(is_user_constructed_group>); +static_assert(is_user_constructed_group>); +static_assert(is_user_constructed_group>); +static_assert(is_user_constructed_group>); + +// sub_group itself is NOT user-constructed +static_assert(not is_user_constructed_group); \ No newline at end of file diff --git a/sycl/test/non-uniform-groups/type_traits.cpp b/sycl/test/non-uniform-groups/type_traits.cpp new file mode 100644 index 0000000000000..4cebc565cf0e1 --- /dev/null +++ b/sycl/test/non-uniform-groups/type_traits.cpp @@ -0,0 +1,31 @@ +// RUN: %clangxx -fsycl -fsyntax-only %s + +#include +#include +#include + +namespace syclex = sycl::ext::oneapi::experimental; + +// check each trait correctly identifies own type +static_assert(syclex::is_chunk_v>); +static_assert(syclex::is_fragment_v>); +static_assert(syclex::is_tangle_v>); + +// check traits return false for different group types (cross-check) +static_assert(!syclex::is_chunk_v>); +static_assert(!syclex::is_chunk_v>); +static_assert(!syclex::is_fragment_v>); +static_assert(!syclex::is_fragment_v>); +static_assert(!syclex::is_tangle_v>); +static_assert(!syclex::is_tangle_v>); + +// check traits return false for base group types +static_assert(!syclex::is_chunk_v); +static_assert(!syclex::is_fragment_v); +static_assert(!syclex::is_tangle_v); + +// chunk sizes +static_assert(syclex::is_chunk_v>); +static_assert(syclex::is_chunk_v>); + +// these traits are used in spirv.hpp MapShuffleID() for dispatch diff --git a/sycl/test/regression/group_algorithms.cpp b/sycl/test/regression/group_algorithms.cpp index 0d9cecc0192db..76677ac743b94 100644 --- a/sycl/test/regression/group_algorithms.cpp +++ b/sycl/test/regression/group_algorithms.cpp @@ -1,6 +1,9 @@ // RUN: %clangxx -fsycl -fsyntax-only %s // RUN: %if preview-breaking-changes-supported %{ %clangxx -fsycl -fsyntax-only -fpreview-breaking-changes %s %} +#include +#include +#include #include #include @@ -153,10 +156,11 @@ int main() { sycl::sub_group SG = NDI.get_sub_group(); TestForGroup(SG); TestForGroup(NDI.get_group()); - TestForGroup(syclex::get_ballot_group(SG, true)); - TestForGroup(syclex::get_fixed_size_group<8>(SG)); - TestForGroup(syclex::get_tangle_group(SG)); - TestForGroup(syclex::this_kernel::get_opportunistic_group()); + TestForGroup(syclex::binary_partition(SG, true)); + TestForGroup(syclex::chunked_partition<8>(SG)); + TestForGroup(syclex::entangle(SG)); + TestForGroup( + syclex::this_work_item::get_opportunistic_group()); }); return 0; } diff --git a/sycl/unittests/context_device/CMakeLists.txt b/sycl/unittests/context_device/CMakeLists.txt index 02488fae878bd..ff435ec68a8d7 100644 --- a/sycl/unittests/context_device/CMakeLists.txt +++ b/sycl/unittests/context_device/CMakeLists.txt @@ -1,4 +1,5 @@ add_sycl_unittest(ContextDeviceTests OBJECT Context.cpp DeviceRefCounter.cpp + HasExtensionWordBoundary.cpp ) diff --git a/sycl/unittests/context_device/HasExtensionWordBoundary.cpp b/sycl/unittests/context_device/HasExtensionWordBoundary.cpp new file mode 100644 index 0000000000000..bad3ebfd47cd8 --- /dev/null +++ b/sycl/unittests/context_device/HasExtensionWordBoundary.cpp @@ -0,0 +1,125 @@ +//==---- HasExtensionWordBoundary.cpp --- Test word boundary fix ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This test verifies that has_extension correctly matches full extension names +// and doesn't match partial substrings. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +using namespace sycl; + +static std::string MockExtensions = ""; + +static ur_result_t redefinedDeviceGetInfo(void *pParams) { + auto params = *static_cast(pParams); + + if (*params.ppropName == UR_DEVICE_INFO_EXTENSIONS) { + // override extensions query with mock data + if (*params.ppPropValue) { + size_t len = MockExtensions.length() + 1; + if (*params.ppropSize >= len) + std::memcpy(*params.ppPropValue, MockExtensions.c_str(), len); + } + if (*params.ppPropSizeRet) + **params.ppPropSizeRet = MockExtensions.length() + 1; + + return UR_RESULT_SUCCESS; + } + + // delegate to the default mock + return sycl::unittest::MockAdapter::mock_urDeviceGetInfo(pParams); +} + +class HasExtensionWordBoundaryTest : public ::testing::Test { +public: + HasExtensionWordBoundaryTest() : Mock{}, Plt{sycl::platform()} {} + +protected: + void SetUp() override { + mock::getCallbacks().set_replace_callback("urDeviceGetInfo", + &redefinedDeviceGetInfo); + } + + void TearDown() override { + // nullptr to use default mock + mock::getCallbacks().set_replace_callback("urDeviceGetInfo", nullptr); + } + + sycl::unittest::UrMock<> Mock; + sycl::platform Plt; +}; + +TEST_F(HasExtensionWordBoundaryTest, ExactMatchWorks) { + MockExtensions = "cl_khr_fp64 cl_intel_subgroups cl_khr_subgroups"; + Plt = sycl::platform(); + + sycl::device Dev = Plt.get_devices()[0]; + auto DevImpl = detail::getSyclObjImpl(Dev); + + EXPECT_TRUE(DevImpl->has_extension("cl_khr_fp64")); + EXPECT_TRUE(DevImpl->has_extension("cl_intel_subgroups")); + EXPECT_TRUE(DevImpl->has_extension("cl_khr_subgroups")); +} + +TEST_F(HasExtensionWordBoundaryTest, SubstringDoesNotMatch) { + MockExtensions = "cl_intel_subgroups cl_khr_fp64_extended"; + sycl::device Dev = Plt.get_devices()[0]; + auto DevImpl = detail::getSyclObjImpl(Dev); + + // These should NOT match because they're substrings + EXPECT_FALSE(DevImpl->has_extension("cl_intel_subgroup")); // Missing 's' + + // Would match in old implementation + EXPECT_FALSE(DevImpl->has_extension("cl_khr_fp64")); + EXPECT_FALSE(DevImpl->has_extension("subgroups")); // Partial match + EXPECT_FALSE(DevImpl->has_extension("intel_subgroups")); // Partial match +} + +TEST_F(HasExtensionWordBoundaryTest, EdgeCases) { + sycl::device Dev = Plt.get_devices()[0]; + auto DevImpl = detail::getSyclObjImpl(Dev); + MockExtensions = ""; + EXPECT_FALSE(DevImpl->has_extension("cl_khr_fp64")); + MockExtensions = "cl_khr_fp64"; + EXPECT_TRUE(DevImpl->has_extension("cl_khr_fp64")); + EXPECT_FALSE(DevImpl->has_extension("cl_khr_fp6")); // Substring + MockExtensions = "cl_first_ext cl_middle_ext cl_last_ext"; + EXPECT_TRUE(DevImpl->has_extension("cl_first_ext")); + EXPECT_TRUE(DevImpl->has_extension("cl_middle_ext")); + EXPECT_TRUE(DevImpl->has_extension("cl_last_ext")); +} + +TEST_F(HasExtensionWordBoundaryTest, NonUniformGroupExtensions) { + MockExtensions = "cl_khr_subgroup_non_uniform_vote " + "cl_khr_subgroup_ballot " + "cl_intel_subgroups " + "cl_intel_spirv_subgroups " + "cl_intel_subgroup_matrix_multiply_accumulate"; + sycl::device Dev = Plt.get_devices()[0]; + auto DevImpl = detail::getSyclObjImpl(Dev); + + // should match (real extensions) + EXPECT_TRUE(DevImpl->has_extension("cl_khr_subgroup_non_uniform_vote")); + EXPECT_TRUE(DevImpl->has_extension("cl_khr_subgroup_ballot")); + EXPECT_TRUE(DevImpl->has_extension("cl_intel_subgroups")); + EXPECT_TRUE(DevImpl->has_extension("cl_intel_spirv_subgroups")); + EXPECT_TRUE( + DevImpl->has_extension("cl_intel_subgroup_matrix_multiply_accumulate")); + + // next should NOT match (substrings that would match with old impl.) + EXPECT_FALSE(DevImpl->has_extension("cl_khr_subgroup")); + EXPECT_FALSE(DevImpl->has_extension("cl_intel_subgroup")); // Missing 's' + EXPECT_FALSE(DevImpl->has_extension("non_uniform_vote")); // Missing prefix + EXPECT_FALSE(DevImpl->has_extension("subgroup_matrix_multiply")); +} \ No newline at end of file From 244962d6c1e5a53ef21f14491cf06d74a125a154 Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Tue, 1 Jul 2025 15:25:45 +0200 Subject: [PATCH 02/32] [SYCL] Fixed missing code formatting --- sycl/include/sycl/ext/oneapi/experimental/tangle.hpp | 2 +- sycl/test/non-uniform-groups/is_user_constructed.cpp | 3 ++- sycl/test/non-uniform-groups/type_traits.cpp | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp index 726fba7b8bde9..8ad18dddb6e10 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp @@ -158,7 +158,7 @@ entangle(ParentGroup parent) { return tangle(mask); #elif defined(__NVPTX__) // TODO: CUDA devices will report false for the tangle - // support aspect so kernels launch should ensure this is never run. + // support aspect so kernels launch should ensure this is never run. return tangle(0); #endif #else diff --git a/sycl/test/non-uniform-groups/is_user_constructed.cpp b/sycl/test/non-uniform-groups/is_user_constructed.cpp index a694c4ba49eca..a3df70c4d18a8 100644 --- a/sycl/test/non-uniform-groups/is_user_constructed.cpp +++ b/sycl/test/non-uniform-groups/is_user_constructed.cpp @@ -6,7 +6,8 @@ namespace syclex = sycl::ext::oneapi::experimental; template -inline constexpr bool is_user_constructed_group = syclex::is_user_constructed_group_v; +inline constexpr bool is_user_constructed_group = + syclex::is_user_constructed_group_v; // is recognized as user-constructed static_assert(is_user_constructed_group>); diff --git a/sycl/test/non-uniform-groups/type_traits.cpp b/sycl/test/non-uniform-groups/type_traits.cpp index 4cebc565cf0e1..46c901d072432 100644 --- a/sycl/test/non-uniform-groups/type_traits.cpp +++ b/sycl/test/non-uniform-groups/type_traits.cpp @@ -1,7 +1,7 @@ // RUN: %clangxx -fsycl -fsyntax-only %s -#include #include +#include #include namespace syclex = sycl::ext::oneapi::experimental; From 4737295b417329035133c285bb55d05d5125474f Mon Sep 17 00:00:00 2001 From: AndreiZibrov Date: Tue, 1 Jul 2025 16:33:14 +0200 Subject: [PATCH 03/32] [SYCL ] Update AddSecurityFlags.cmake for wip --- llvm/cmake/modules/AddSecurityFlags.cmake | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/cmake/modules/AddSecurityFlags.cmake b/llvm/cmake/modules/AddSecurityFlags.cmake index 6e1e7058f1dd4..7f49778e0a5b1 100644 --- a/llvm/cmake/modules/AddSecurityFlags.cmake +++ b/llvm/cmake/modules/AddSecurityFlags.cmake @@ -168,18 +168,18 @@ macro(append_common_extra_security_flags) if(LLVM_ON_UNIX) # Fortify Source (strongly recommended): if(CMAKE_BUILD_TYPE STREQUAL "Debug") - message(WARNING "-D_FORTIFY_SOURCE=2 can only be used with optimization.") - message(WARNING "-D_FORTIFY_SOURCE=2 is not supported.") + message(WARNING "-D_FORTIFY_SOURCE=3 can only be used with optimization.") + message(WARNING "-D_FORTIFY_SOURCE=3 is not supported.") else() # Sanitizers do not work with checked memory functions, such as # __memset_chk. We do not build release packages with sanitizers, so just - # avoid -D_FORTIFY_SOURCE=2 under LLVM_USE_SANITIZER. + # avoid -D_FORTIFY_SOURCE=3 under LLVM_USE_SANITIZER. if(NOT LLVM_USE_SANITIZER) - message(STATUS "Building with -D_FORTIFY_SOURCE=2") - add_definitions(-D_FORTIFY_SOURCE=2) + message(STATUS "Building with -D_FORTIFY_SOURCE=3") + add_definitions(-D_FORTIFY_SOURCE=3) else() message( - WARNING "-D_FORTIFY_SOURCE=2 dropped due to LLVM_USE_SANITIZER.") + WARNING "-D_FORTIFY_SOURCE=3 dropped due to LLVM_USE_SANITIZER.") endif() endif() From 1e1a6e7677ec240fd769aa4217b0f6416bec6b82 Mon Sep 17 00:00:00 2001 From: AndreiZibrov Date: Tue, 1 Jul 2025 17:16:51 +0200 Subject: [PATCH 04/32] Update sycl/include/sycl/detail/spirv.hpp Co-authored-by: John Pennycook --- sycl/include/sycl/detail/spirv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 163334e5a7969..45157438a869e 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -206,7 +206,7 @@ bool GroupAll(ext::oneapi::experimental::tangle, bool pred) { template bool GroupAll(ext::oneapi::experimental::chunk, bool pred) { - // Using reduction becaue the GroupNonUniformAll have no support of cluster + // Using reduction because the GroupNonUniformAll have no support of cluster // size return __spirv_GroupNonUniformBitwiseAnd( group_scope::value, From f2a2bc5dbb161a23289d527b6c089cb456076e31 Mon Sep 17 00:00:00 2001 From: AndreiZibrov Date: Tue, 1 Jul 2025 17:17:31 +0200 Subject: [PATCH 05/32] Update sycl/include/sycl/ext/oneapi/experimental/fragment.hpp Co-authored-by: John Pennycook --- sycl/include/sycl/ext/oneapi/experimental/fragment.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp b/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp index a89acdbabe7c2..a5c082c5d124a 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp @@ -170,7 +170,7 @@ binary_partition(ParentGroup parent, bool predicate) { #if defined(__SPIR__) || defined(__SPIRV__) || defined(__NVPTX__) sub_group_mask mask = sycl::ext::oneapi::group_ballot(parent, predicate); id<1> group_id = predicate ? 1 : 0; - range<1> group_range = 2; // 2 groupds based on predicate by binary_partition + range<1> group_range = 2; // 2 groups based on predicate by binary_partition if (!predicate) { sub_group_mask::BitsType participant_filter = From 0b8eba3df69ed9a8aa5d68b86955f2e47a3fb948 Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Wed, 2 Jul 2025 06:51:21 +0200 Subject: [PATCH 06/32] [SYCL] Fix review findings --- sycl/include/sycl/detail/spirv.hpp | 69 ++++++++++++++----- .../sycl/ext/oneapi/experimental/tangle.hpp | 7 +- sycl/include/sycl/info/aspects.def | 60 ++++++++-------- sycl/source/detail/device_impl.hpp | 20 ++---- sycl/test-e2e/Assert/check_resource_leak.cpp | 1 - sycl/test-e2e/NonUniformGroups/chunk.cpp | 1 - sycl/test-e2e/NonUniformGroups/fragment.cpp | 1 - .../NonUniformGroups/opportunistic.cpp | 1 - 8 files changed, 92 insertions(+), 68 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 45157438a869e..da2323d56c519 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -23,7 +23,8 @@ #include #endif -#include // sycl::detail::memcpy +#include // sycl::detail::memcpy +#include // is_fixed_size_group traits namespace sycl { inline namespace _V1 { @@ -38,7 +39,6 @@ template class root_group; template class tangle; template class fragment; template class chunk; -// opportunistic_group merged into fragment } // namespace experimental } // namespace oneapi } // namespace ext @@ -74,15 +74,11 @@ struct is_tangle_group> template struct is_ballot_group : std::false_type {}; -template -struct is_ballot_group> - : std::true_type {}; - -template struct is_fixed_size_group : std::false_type {}; - template -struct is_fixed_size_group> : std::true_type {}; +struct detail::is_fixed_size_group< + sycl::ext::oneapi::experimental::fixed_size_group> + : std::true_type {}; template struct is_fragment : std::false_type {}; @@ -187,6 +183,16 @@ bool GroupAll(ext::oneapi::experimental::fragment g, bool pred) { } else { return __spirv_GroupNonUniformAll(group_scope::value, pred); } + // TODO: adding support for fragments have partitioning into more than two + // groups such as labeled_partition: + // + // 1. add size_t framgment::get_group_count() const definition + // 2. const auto group_count{g.get_group_count()}; + // 3. for (size_t i{}; i != group_count; ++i) + // if (g.get_group_id() == i) + // return __spirv_GroupNonUniformAll( + // group_scope::value, + // pred); } template bool GroupAll( @@ -227,6 +233,16 @@ bool GroupAny(ext::oneapi::experimental::fragment g, bool pred) { } else { return __spirv_GroupNonUniformAny(group_scope::value, pred); } + // TODO: adding support for fragments have partitioning into more than two + // groups such as labeled_partition: + // + // 1. add size_t framgment::get_group_count() const definition + // 2. const auto group_count{g.get_group_count()}; + // 3. for (size_t i{}; i != group_count; ++i) + // if (g.get_group_id() == i) + // return __spirv_GroupNonUniformAny( + // group_scope::value, + // pred); } template bool GroupAny( @@ -246,7 +262,7 @@ bool GroupAny(ext::oneapi::experimental::tangle, bool pred) { template bool GroupAny(ext::oneapi::experimental::chunk, bool pred) { - // Using reduction becaue the GroupNonUniformAll have no support of cluster + // Using reduction because the GroupNonUniformAll have no support of cluster // size return __spirv_GroupNonUniformBitwiseOr( group_scope::value, @@ -254,8 +270,6 @@ bool GroupAny(ext::oneapi::experimental::chunk, static_cast(pred), ChunkSize); } -// opportunistic_group merged into fragment - // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic // FIXME: Do not special-case for half or vec once all backends support all data // types. @@ -352,6 +366,17 @@ GroupBroadcast(sycl::ext::oneapi::experimental::fragment g, T x, return __spirv_GroupNonUniformBroadcast(group_scope::value, WideOCLX, OCLId); } + // TODO: adding support for fragments have partitioning into more than two + // groups such as labeled_partition: + // + // 1. add size_t framgment::get_group_count() const definition + // 2. const auto group_count{g.get_group_count()}; + // 3. for (size_t i{}; i != group_count; ++i) + // if (g.get_group_id() == i) + // return __spirv_GroupNonUniformBroadcast( + // group_scope::value, + // WideOCLX, + // OCLId); } template EnableIfNativeBroadcast GroupBroadcast( @@ -931,7 +956,7 @@ inline uint32_t MapShuffleID(GroupT g, id<1> local_id) { if constexpr (is_tangle_group::value || is_ballot_group::value || is_fragment::value) return detail::IdToMaskPosition(g, local_id); - else if constexpr (is_fixed_size_group::value || + else if constexpr (detail::is_fixed_size_group::value || is_chunk::value) return g.get_group_linear_id() * g.get_local_range().size() + local_id; else @@ -1028,10 +1053,9 @@ EnableIfNativeShuffle ShuffleXor(GroupT g, T x, id<1> mask) { if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< GroupT>) { auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0]; - if constexpr (is_fixed_size_group_v) { + if constexpr (detail::is_fixed_size_group_v) { return cuda_shfl_sync_bfly_i32(MemberMask, x, static_cast(mask.get(0)), 0x1f); - } else { int unfoldedSrcSetBit = (g.get_local_id()[0] ^ static_cast(mask.get(0))) + 1; @@ -1076,7 +1100,7 @@ EnableIfNativeShuffle ShuffleDown(GroupT g, T x, uint32_t delta) { if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< GroupT>) { auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0]; - if constexpr (is_fixed_size_group_v) { + if constexpr (detail::is_fixed_size_group_v) { return cuda_shfl_sync_down_i32(MemberMask, x, delta, 31); } else { unsigned localSetBit = g.get_local_id()[0] + 1; @@ -1120,7 +1144,7 @@ EnableIfNativeShuffle ShuffleUp(GroupT g, T x, uint32_t delta) { if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< GroupT>) { auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0]; - if constexpr (is_fixed_size_group_v) { + if constexpr (detail::is_fixed_size_group_v) { return cuda_shfl_sync_up_i32(MemberMask, x, delta, 0); } else { unsigned localSetBit = g.get_local_id()[0] + 1; @@ -1341,6 +1365,15 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { } else { \ return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \ } \ + // clang-format off \ + /* TODO: add support for partitioning into more than two groups \ + rewrite as a loop over g.get_group_count(). \ + for (size_t i{}; i != group_count; ++i) \ + if (g.get_group_id() == i) \ + return __spirv_GroupNonUniform##Instruction( \ + group_scope::value, \ + OpInt, Arg);*/ \ + // clang-format on \ } \ \ template <__spv::GroupOperation Op, size_t PartitionSize, \ diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp index 8ad18dddb6e10..5be649f985c8e 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp @@ -153,7 +153,12 @@ entangle(ParentGroup parent) { sycl::group_barrier(parent); #if defined(__SPIR__) || defined(__SPIRV__) - // mask is required to calculate IDs (not the group). + // All SPIR-V devices that we currently target execute in SIMD fashion, + // and so the group of work-items in converged control flow is implicit. + // We store the mask here because it is required to calculate IDs, not + // because it is required to construct the group. + + // mask is required to calculate IDs (not the group) sub_group_mask mask = sycl::ext::oneapi::group_ballot(parent, true); return tangle(mask); #elif defined(__NVPTX__) diff --git a/sycl/include/sycl/info/aspects.def b/sycl/include/sycl/info/aspects.def index 246bd7766e0d4..999db478eb075 100644 --- a/sycl/include/sycl/info/aspects.def +++ b/sycl/include/sycl/info/aspects.def @@ -47,34 +47,34 @@ __SYCL_ASPECT(ext_oneapi_mipmap_level_reference, 52) __SYCL_ASPECT(ext_intel_esimd, 53) __SYCL_ASPECT(ext_oneapi_fragment, 54) __SYCL_ASPECT(ext_oneapi_chunk, 55) -__SYCL_ASPECT(ext_oneapi_tangle, 56) -__SYCL_ASPECT(ext_intel_matrix, 57) -__SYCL_ASPECT(ext_oneapi_is_composite, 58) -__SYCL_ASPECT(ext_oneapi_is_component, 59) -__SYCL_ASPECT(ext_oneapi_graph, 60) -__SYCL_ASPECT(ext_intel_fpga_task_sequence, 61) -__SYCL_ASPECT(ext_oneapi_limited_graph, 62) -__SYCL_ASPECT(ext_oneapi_private_alloca, 63) -__SYCL_ASPECT(ext_oneapi_cubemap, 64) -__SYCL_ASPECT(ext_oneapi_cubemap_seamless_filtering, 65) -__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_1d_usm, 66) -__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_1d, 67) -__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_2d_usm, 68) -__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_2d, 69) -__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_3d, 70) -__SYCL_ASPECT(ext_oneapi_queue_profiling_tag, 71) -__SYCL_ASPECT(ext_oneapi_virtual_mem, 72) -__SYCL_ASPECT(ext_oneapi_cuda_cluster_group, 73) -__SYCL_ASPECT(ext_oneapi_image_array, 74) -__SYCL_ASPECT(ext_oneapi_unique_addressing_per_dim, 75) -__SYCL_ASPECT(ext_oneapi_bindless_images_sample_1d_usm, 76) -__SYCL_ASPECT(ext_oneapi_bindless_images_sample_2d_usm, 77) -__SYCL_ASPECT(ext_oneapi_atomic16, 78) -__SYCL_ASPECT(ext_oneapi_virtual_functions, 79) -__SYCL_ASPECT(ext_intel_spill_memory_size, 80) -__SYCL_ASPECT(ext_oneapi_bindless_images_gather, 81) -__SYCL_ASPECT(ext_intel_current_clock_throttle_reasons, 82) -__SYCL_ASPECT(ext_intel_fan_speed, 83) -__SYCL_ASPECT(ext_intel_power_limits, 84) -__SYCL_ASPECT(ext_oneapi_async_memory_alloc, 85) +__SYCL_ASPECT(ext_oneapi_tangle, 57) +__SYCL_ASPECT(ext_intel_matrix, 58) +__SYCL_ASPECT(ext_oneapi_is_composite, 59) +__SYCL_ASPECT(ext_oneapi_is_component, 60) +__SYCL_ASPECT(ext_oneapi_graph, 61) +__SYCL_ASPECT(ext_intel_fpga_task_sequence, 62) +__SYCL_ASPECT(ext_oneapi_limited_graph, 63) +__SYCL_ASPECT(ext_oneapi_private_alloca, 64) +__SYCL_ASPECT(ext_oneapi_cubemap, 65) +__SYCL_ASPECT(ext_oneapi_cubemap_seamless_filtering, 66) +__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_1d_usm, 67) +__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_1d, 68) +__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_2d_usm, 69) +__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_2d, 70) +__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_3d, 72) +__SYCL_ASPECT(ext_oneapi_queue_profiling_tag, 73) +__SYCL_ASPECT(ext_oneapi_virtual_mem, 74) +__SYCL_ASPECT(ext_oneapi_cuda_cluster_group, 75) +__SYCL_ASPECT(ext_oneapi_image_array, 76) +__SYCL_ASPECT(ext_oneapi_unique_addressing_per_dim, 77) +__SYCL_ASPECT(ext_oneapi_bindless_images_sample_1d_usm, 78) +__SYCL_ASPECT(ext_oneapi_bindless_images_sample_2d_usm, 79) +__SYCL_ASPECT(ext_oneapi_atomic16, 80) +__SYCL_ASPECT(ext_oneapi_virtual_functions, 81) +__SYCL_ASPECT(ext_intel_spill_memory_size, 82) +__SYCL_ASPECT(ext_oneapi_bindless_images_gather, 83) +__SYCL_ASPECT(ext_intel_current_clock_throttle_reasons, 84) +__SYCL_ASPECT(ext_intel_fan_speed, 85) +__SYCL_ASPECT(ext_intel_power_limits, 86) +__SYCL_ASPECT(ext_oneapi_async_memory_alloc, 87) diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index fa22c8e5bf4ff..565cbb150f5e4 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -1434,29 +1434,19 @@ class device_impl : public std::enable_shared_from_this { return get_info_impl_nocheck().value_or(0); } CASE(ext_oneapi_fragment) { - // check actual OpenCL extensions: - if (this->getBackend() == backend::opencl) - return has_extension("cl_khr_subgroup_non_uniform_vote") || - has_extension("cl_khr_subgroup_ballot") || - has_extension("cl_intel_spirv_subgroups"); - // TODO: add corrcet checks for other backends return (this->getBackend() == backend::ext_oneapi_level_zero) || + (this->getBackend() == backend::opencl) || (this->getBackend() == backend::ext_oneapi_cuda); } CASE(ext_oneapi_chunk) { - // Check for Intel subgroups extension that provides block read/write - if (this->getBackend() == backend::opencl) - return has_extension("cl_intel_subgroups"); - // TODO: add corrcet checks for other backends return (this->getBackend() == backend::ext_oneapi_level_zero) || + (this->getBackend() == backend::opencl) || (this->getBackend() == backend::ext_oneapi_cuda); } CASE(ext_oneapi_tangle) { - // note: typically is available on newer GPUs - not on CPUs - if (this->getBackend() == backend::opencl) - return has_extension("cl_intel_subgroup_matrix_multiply_accumulate"); - // TODO: add proper checks for other backends - return (this->getBackend() == backend::ext_oneapi_level_zero); + return (this->getBackend() == backend::ext_oneapi_level_zero) || + (this->getBackend() == backend::opencl) || + (this->getBackend() == backend::ext_oneapi_cuda); } CASE(ext_intel_matrix) { using arch = sycl::ext::oneapi::experimental::architecture; diff --git a/sycl/test-e2e/Assert/check_resource_leak.cpp b/sycl/test-e2e/Assert/check_resource_leak.cpp index 791b8c6d39c61..2a5fc00f2295a 100644 --- a/sycl/test-e2e/Assert/check_resource_leak.cpp +++ b/sycl/test-e2e/Assert/check_resource_leak.cpp @@ -12,7 +12,6 @@ #include #include -#include #include #include diff --git a/sycl/test-e2e/NonUniformGroups/chunk.cpp b/sycl/test-e2e/NonUniformGroups/chunk.cpp index 25bef36cbfcdc..e74b49c9d0c3c 100644 --- a/sycl/test-e2e/NonUniformGroups/chunk.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk.cpp @@ -7,7 +7,6 @@ // RUN: %if cpu %{ %{run} %t.x86.out %} // // REQUIRES: cpu || gpu -// UNSUPPORTED: hip // REQUIRES: sg-32 #include diff --git a/sycl/test-e2e/NonUniformGroups/fragment.cpp b/sycl/test-e2e/NonUniformGroups/fragment.cpp index 2df5abf5d834f..2117507f87af7 100644 --- a/sycl/test-e2e/NonUniformGroups/fragment.cpp +++ b/sycl/test-e2e/NonUniformGroups/fragment.cpp @@ -7,7 +7,6 @@ // RUN: %if cpu %{ %{run} %t.x86.out %} // // REQUIRES: cpu || gpu -// UNSUPPORTED: hip #include #include diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic.cpp index ed38b89ad2312..d0133a0d9cb96 100644 --- a/sycl/test-e2e/NonUniformGroups/opportunistic.cpp +++ b/sycl/test-e2e/NonUniformGroups/opportunistic.cpp @@ -7,7 +7,6 @@ // RUN: %if cpu %{ %{run} %t.x86.out %} // // REQUIRES: cpu || gpu -// UNSUPPORTED: hip #include #include From 88d35d93f4826406fe11fd24db67de1e764fd3e4 Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Wed, 2 Jul 2025 07:15:07 +0200 Subject: [PATCH 07/32] [SYCL] getting rid of throwing --- .../sycl/ext/oneapi/experimental/chunk.hpp | 33 +++++++------------ .../sycl/ext/oneapi/experimental/fragment.hpp | 33 +++++++------------ .../sycl/ext/oneapi/experimental/tangle.hpp | 30 ++++++----------- sycl/test-e2e/NonUniformGroups/chunk.cpp | 9 ++--- sycl/test-e2e/NonUniformGroups/fragment.cpp | 7 +--- .../NonUniformGroups/opportunistic.cpp | 7 +--- sycl/test-e2e/NonUniformGroups/tangle.cpp | 7 +--- 7 files changed, 37 insertions(+), 89 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp index f94b7bec655e4..46274ad33d8e9 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp @@ -53,8 +53,7 @@ template class chunk { #ifdef __SYCL_DEVICE_ONLY__ return __spirv_SubgroupLocalInvocationId() / ChunkSize; #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return id_type(0); #endif } @@ -62,8 +61,7 @@ template class chunk { #ifdef __SYCL_DEVICE_ONLY__ return __spirv_SubgroupLocalInvocationId() % ChunkSize; #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return id_type(0); #endif } @@ -71,8 +69,7 @@ template class chunk { #ifdef __SYCL_DEVICE_ONLY__ return __spirv_SubgroupSize() / ChunkSize; #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return range_type(0); #endif } @@ -80,8 +77,7 @@ template class chunk { #ifdef __SYCL_DEVICE_ONLY__ return ChunkSize; #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return range_type(0); #endif } @@ -89,8 +85,7 @@ template class chunk { #ifdef __SYCL_DEVICE_ONLY__ return static_cast(get_group_id()[0]); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return linear_id_type(0); #endif } @@ -98,8 +93,7 @@ template class chunk { #ifdef __SYCL_DEVICE_ONLY__ return static_cast(get_local_id()[0]); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return linear_id_type(0); #endif } @@ -107,8 +101,7 @@ template class chunk { #ifdef __SYCL_DEVICE_ONLY__ return static_cast(get_group_range()[0]); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return linear_id_type(0); #endif } @@ -116,8 +109,7 @@ template class chunk { #ifdef __SYCL_DEVICE_ONLY__ return static_cast(get_local_range()[0]); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return linear_id_type(0); #endif } @@ -125,8 +117,7 @@ template class chunk { #ifdef __SYCL_DEVICE_ONLY__ return get_local_linear_id() == 0; #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return linear_id_type(0); #endif } @@ -173,8 +164,7 @@ chunked_partition(ParentGroup parent) { return chunk(); #endif #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return chunk(); #endif } @@ -221,8 +211,7 @@ inline chunk::operator fragment() const { return fragment(mask, get_group_id(), get_group_range()); #endif #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return fragment(); #endif } diff --git a/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp b/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp index a5c082c5d124a..dbe74add3a021 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp @@ -62,8 +62,7 @@ template class fragment { #ifdef __SYCL_DEVICE_ONLY__ return GroupID; #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return id_type(0); #endif } @@ -71,8 +70,7 @@ template class fragment { #ifdef __SYCL_DEVICE_ONLY__ return sycl::detail::CallerPositionInMask(Mask); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return id_type(0); #endif } @@ -80,8 +78,7 @@ template class fragment { #ifdef __SYCL_DEVICE_ONLY__ return GroupRange; #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return range_type(0); #endif } @@ -89,8 +86,7 @@ template class fragment { #ifdef __SYCL_DEVICE_ONLY__ return Mask.count(); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return range_type(0); #endif } @@ -98,8 +94,7 @@ template class fragment { #ifdef __SYCL_DEVICE_ONLY__ return static_cast(get_group_id()[0]); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return linear_id_type(0); #endif } @@ -107,8 +102,7 @@ template class fragment { #ifdef __SYCL_DEVICE_ONLY__ return static_cast(get_local_id()[0]); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return linear_id_type(0); #endif } @@ -116,8 +110,7 @@ template class fragment { #ifdef __SYCL_DEVICE_ONLY__ return static_cast(get_group_range()[0]); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return linear_id_type(0); #endif } @@ -125,8 +118,7 @@ template class fragment { #ifdef __SYCL_DEVICE_ONLY__ return static_cast(get_local_range()[0]); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return linear_id_type(0); #endif } @@ -135,8 +127,7 @@ template class fragment { uint32_t Lowest = static_cast(Mask.find_low()[0]); return __spirv_SubgroupLocalInvocationId() == Lowest; #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return false; #endif } @@ -183,8 +174,7 @@ binary_partition(ParentGroup parent, bool predicate) { #endif #else (void)predicate; - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return fragment(); #endif } @@ -205,8 +195,7 @@ inline fragment get_opportunistic_group() { return fragment(mask, 0, 1); #endif #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return fragment(sub_group_mask(), id<1>(0), range<1>(1)); #endif } diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp index 5be649f985c8e..1324e0612d56d 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp @@ -47,8 +47,7 @@ template class tangle { #ifdef __SYCL_DEVICE_ONLY__ return static_cast(0); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return id_type(0); #endif } @@ -56,8 +55,7 @@ template class tangle { #ifdef __SYCL_DEVICE_ONLY__ return sycl::detail::CallerPositionInMask(Mask); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return id_type(0); #endif } @@ -65,8 +63,7 @@ template class tangle { #ifdef __SYCL_DEVICE_ONLY__ return 1; #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return range_type(0); #endif } @@ -74,8 +71,7 @@ template class tangle { #ifdef __SYCL_DEVICE_ONLY__ return Mask.count(); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return range_type(0); #endif } @@ -83,8 +79,7 @@ template class tangle { #ifdef __SYCL_DEVICE_ONLY__ return static_cast(get_group_id()[0]); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return linear_id_type(0); #endif } @@ -92,8 +87,7 @@ template class tangle { #ifdef __SYCL_DEVICE_ONLY__ return static_cast(get_local_id()[0]); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return linear_id_type(0); #endif } @@ -101,8 +95,7 @@ template class tangle { #ifdef __SYCL_DEVICE_ONLY__ return static_cast(get_group_range()[0]); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return linear_id_type(0); #endif } @@ -110,8 +103,7 @@ template class tangle { #ifdef __SYCL_DEVICE_ONLY__ return static_cast(get_local_range()[0]); #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return linear_id_type(0); #endif } @@ -120,8 +112,7 @@ template class tangle { uint32_t Lowest = static_cast(Mask.find_low()[0]); return __spirv_SubgroupLocalInvocationId() == Lowest; #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return false; #endif } @@ -167,8 +158,7 @@ entangle(ParentGroup parent) { return tangle(0); #endif #else - throw exception(make_error_code(errc::runtime), - "Non-uniform groups are not supported on host."); + return tangle(); #endif } diff --git a/sycl/test-e2e/NonUniformGroups/chunk.cpp b/sycl/test-e2e/NonUniformGroups/chunk.cpp index e74b49c9d0c3c..c760f3ab0157c 100644 --- a/sycl/test-e2e/NonUniformGroups/chunk.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk.cpp @@ -8,10 +8,11 @@ // // REQUIRES: cpu || gpu // REQUIRES: sg-32 +// REQUIRES: aspect-ext_oneapi_chunk #include #include -#include + namespace syclex = sycl::ext::oneapi::experimental; template class TestKernel; @@ -19,12 +20,6 @@ template class TestKernel; template void test() { sycl::queue Q; - if (!Q.get_device().has(sycl::aspect::ext_oneapi_chunk)) { - std::cout << "Device has no support for ext_oneapi_chunk aspect..." - << std::endl; - return; - } - // Test for both the full sub-group size and a case with less work than a full // sub-group. for (size_t WGS : std::array{32, 16}) { diff --git a/sycl/test-e2e/NonUniformGroups/fragment.cpp b/sycl/test-e2e/NonUniformGroups/fragment.cpp index 2117507f87af7..47e2a3f88d7be 100644 --- a/sycl/test-e2e/NonUniformGroups/fragment.cpp +++ b/sycl/test-e2e/NonUniformGroups/fragment.cpp @@ -7,6 +7,7 @@ // RUN: %if cpu %{ %{run} %t.x86.out %} // // REQUIRES: cpu || gpu +// REQUIRES: aspect-ext_oneapi_fragment #include #include @@ -18,12 +19,6 @@ class TestKernel; int main() { sycl::queue Q; - if (!Q.get_device().has(sycl::aspect::ext_oneapi_fragment)) { - std::cout << "Device has no support for ext_oneapi_fragment aspect..." - << std::endl; - return 0; - } - auto SGSizes = Q.get_device().get_info(); if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { std::cout << "Test skipped due to missing support for sub-group size 32." diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic.cpp index d0133a0d9cb96..730148275212c 100644 --- a/sycl/test-e2e/NonUniformGroups/opportunistic.cpp +++ b/sycl/test-e2e/NonUniformGroups/opportunistic.cpp @@ -7,6 +7,7 @@ // RUN: %if cpu %{ %{run} %t.x86.out %} // // REQUIRES: cpu || gpu +// REQUIRES: aspect-ext_oneapi_fragment #include #include @@ -18,12 +19,6 @@ class TestKernel; int main() { sycl::queue Q; - if (!Q.get_device().has(sycl::aspect::ext_oneapi_fragment)) { - std::cout << "Device has no support for ext_oneapi_fragment aspect..." - << std::endl; - return 0; - } - auto SGSizes = Q.get_device().get_info(); if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { std::cout << "Test skipped due to missing support for sub-group size 32." diff --git a/sycl/test-e2e/NonUniformGroups/tangle.cpp b/sycl/test-e2e/NonUniformGroups/tangle.cpp index a6ead14344ba9..2736ddbc02024 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle.cpp @@ -7,6 +7,7 @@ // RUN: %if cpu %{ %{run} %t.x86.out %} // // REQUIRES: cpu || gpu +// REQUIRES: aspect-ext_oneapi_tangle // UNSUPPORTED: target-nvidia || target-amd #include @@ -19,12 +20,6 @@ class TestKernel; int main() { sycl::queue Q; - if (!Q.get_device().has(sycl::aspect::ext_oneapi_tangle)) { - std::cout << "Device has no support for ext_oneapi_tangle aspect..." - << std::endl; - return 0; - } - auto SGSizes = Q.get_device().get_info(); if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { std::cout << "Test skipped due to missing support for sub-group size 32." From a7d9fb685a39643ea0f07cbef0c13469f44ae92c Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Wed, 2 Jul 2025 11:25:53 +0200 Subject: [PATCH 08/32] [SYCL] Remove rest of fixed_size_group leftover + refactored macros which made 4 templates fro one call --- sycl/include/sycl/detail/spirv.hpp | 172 +++++++---------------- sycl/include/sycl/detail/type_traits.hpp | 5 - 2 files changed, 47 insertions(+), 130 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index da2323d56c519..4159af55efb12 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -16,6 +16,7 @@ // Therefore, we need the following include to get forward-declarations of those // versions. #include +#include #include // for IdToMaskPosition @@ -24,7 +25,7 @@ #endif #include // sycl::detail::memcpy -#include // is_fixed_size_group traits +#include namespace sycl { inline namespace _V1 { @@ -34,7 +35,7 @@ namespace oneapi { struct sub_group; namespace experimental { template class fragment; -template class fixed_size_group; + template class root_group; template class tangle; template class fragment; @@ -74,12 +75,6 @@ struct is_tangle_group> template struct is_ballot_group : std::false_type {}; -template -struct detail::is_fixed_size_group< - sycl::ext::oneapi::experimental::fixed_size_group> - : std::true_type {}; - template struct is_fragment : std::false_type {}; template @@ -115,12 +110,6 @@ struct group_scope> { static constexpr __spv::Scope::Flag value = group_scope::value; }; -template -struct group_scope> { - static constexpr __spv::Scope::Flag value = group_scope::value; -}; - template struct group_scope> { static constexpr __spv::Scope::Flag value = group_scope::value; @@ -194,16 +183,6 @@ bool GroupAll(ext::oneapi::experimental::fragment g, bool pred) { // group_scope::value, // pred); } -template -bool GroupAll( - ext::oneapi::experimental::fixed_size_group, - bool pred) { - // GroupNonUniformAll doesn't support cluster size, so use a reduction - return __spirv_GroupNonUniformBitwiseAnd( - group_scope::value, - static_cast(__spv::GroupOperation::ClusteredReduce), - static_cast(pred), PartitionSize); -} template bool GroupAll(ext::oneapi::experimental::tangle, bool pred) { return __spirv_GroupNonUniformAll(group_scope::value, pred); @@ -244,16 +223,6 @@ bool GroupAny(ext::oneapi::experimental::fragment g, bool pred) { // group_scope::value, // pred); } -template -bool GroupAny( - ext::oneapi::experimental::fixed_size_group, - bool pred) { - // GroupNonUniformAny doesn't support cluster size, so use a reduction - return __spirv_GroupNonUniformBitwiseOr( - group_scope::value, - static_cast(__spv::GroupOperation::ClusteredReduce), - static_cast(pred), PartitionSize); -} template bool GroupAny(ext::oneapi::experimental::tangle, bool pred) { return __spirv_GroupNonUniformAny(group_scope::value, pred); @@ -378,26 +347,6 @@ GroupBroadcast(sycl::ext::oneapi::experimental::fragment g, T x, // WideOCLX, // OCLId); } -template -EnableIfNativeBroadcast GroupBroadcast( - ext::oneapi::experimental::fixed_size_group g, - T x, IdT local_id) { - // Remap local_id to its original numbering in ParentGroup - auto LocalId = g.get_group_linear_id() * PartitionSize + local_id; - - // TODO: Refactor to avoid duplication after design settles. - auto GroupLocalId = static_cast::type>(LocalId); - auto OCLX = detail::convertToOpenCLType(x); - WidenOpenCLTypeTo32_t WideOCLX = OCLX; - auto OCLId = detail::convertToOpenCLType(GroupLocalId); - - // NonUniformBroadcast requires Id to be dynamically uniform, which does not - // hold here; each partition is broadcasting a separate index. We could - // fallback to either NonUniformShuffle or a NonUniformBroadcast per - // partition, and it's unclear which will be faster in practice. - return __spirv_GroupNonUniformShuffle(group_scope::value, - WideOCLX, OCLId); -} template EnableIfNativeBroadcast GroupBroadcast(ext::oneapi::experimental::tangle g, T x, @@ -956,8 +905,7 @@ inline uint32_t MapShuffleID(GroupT g, id<1> local_id) { if constexpr (is_tangle_group::value || is_ballot_group::value || is_fragment::value) return detail::IdToMaskPosition(g, local_id); - else if constexpr (detail::is_fixed_size_group::value || - is_chunk::value) + else if constexpr (is_chunk::value) return g.get_group_linear_id() * g.get_local_range().size() + local_id; else return local_id.get(0); @@ -1053,7 +1001,7 @@ EnableIfNativeShuffle ShuffleXor(GroupT g, T x, id<1> mask) { if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< GroupT>) { auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0]; - if constexpr (detail::is_fixed_size_group_v) { + if constexpr (is_chunk::value) { return cuda_shfl_sync_bfly_i32(MemberMask, x, static_cast(mask.get(0)), 0x1f); } else { @@ -1100,7 +1048,7 @@ EnableIfNativeShuffle ShuffleDown(GroupT g, T x, uint32_t delta) { if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< GroupT>) { auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0]; - if constexpr (detail::is_fixed_size_group_v) { + if constexpr (is_chunk::value) { return cuda_shfl_sync_down_i32(MemberMask, x, delta, 31); } else { unsigned localSetBit = g.get_local_id()[0] + 1; @@ -1144,7 +1092,7 @@ EnableIfNativeShuffle ShuffleUp(GroupT g, T x, uint32_t delta) { if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< GroupT>) { auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0]; - if constexpr (detail::is_fixed_size_group_v) { + if constexpr (is_chunk::value) { return cuda_shfl_sync_up_i32(MemberMask, x, delta, 0); } else { unsigned localSetBit = g.get_local_id()[0] + 1; @@ -1340,8 +1288,10 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { OCLT Ret = __spirv_Group##Instruction##GroupExt( \ group_scope::value, static_cast(Op), Arg); \ return Ret; \ - } \ - \ + } + +// Define separate macros for each template to avoid nesting issues +#define __SYCL_GROUP_COLLECTIVE_FRAGMENT(Instruction, GroupExt) \ template <__spv::GroupOperation Op, typename ParentGroup, typename T> \ inline T Group##Instruction( \ ext::oneapi::experimental::fragment g, T x) { \ @@ -1365,53 +1315,10 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { } else { \ return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \ } \ - // clang-format off \ - /* TODO: add support for partitioning into more than two groups \ - rewrite as a loop over g.get_group_count(). \ - for (size_t i{}; i != group_count; ++i) \ - if (g.get_group_id() == i) \ - return __spirv_GroupNonUniform##Instruction( \ - group_scope::value, \ - OpInt, Arg);*/ \ - // clang-format on \ - } \ - \ - template <__spv::GroupOperation Op, size_t PartitionSize, \ - typename ParentGroup, typename T> \ - inline T Group##Instruction( \ - ext::oneapi::experimental::fixed_size_group \ - g, \ - T x) { \ - using ConvertedT = detail::ConvertToOpenCLType_t; \ - \ - using OCLT = std::conditional_t< \ - std::is_same() || \ - std::is_same(), \ - opencl::cl_int, \ - std::conditional_t() || \ - std::is_same(), \ - opencl::cl_uint, ConvertedT>>; \ - OCLT Arg = x; \ - constexpr auto Scope = group_scope::value; \ - /* SPIR-V only defines a ClusteredReduce, with no equivalents for scan. */ \ - /* Emulate Clustered*Scan using control flow to separate clusters. */ \ - if constexpr (Op == __spv::GroupOperation::Reduce) { \ - constexpr auto OpInt = \ - static_cast(__spv::GroupOperation::ClusteredReduce); \ - return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg, \ - PartitionSize); \ - } else { \ - T tmp; \ - for (size_t Cluster = 0; Cluster < g.get_group_linear_range(); \ - ++Cluster) { \ - if (Cluster == g.get_group_linear_id()) { \ - constexpr auto OpInt = static_cast(Op); \ - tmp = __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \ - } \ - } \ - return tmp; \ - } \ - } \ + /* TODO: add support for partitioning into more than two groups */ \ + } + +#define __SYCL_GROUP_COLLECTIVE_CHUNK(Instruction, GroupExt) \ template <__spv::GroupOperation Op, size_t ChunkSize, typename ParentGroup, \ typename T> \ inline T Group##Instruction( \ @@ -1446,7 +1353,9 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { } \ return tmp; \ } \ - } \ + } + +#define __SYCL_GROUP_COLLECTIVE_TANGLE(Instruction, GroupExt) \ template <__spv::GroupOperation Op, typename Group, typename T> \ inline typename std::enable_if_t::value, T> \ Group##Instruction(Group, T x) { \ @@ -1465,27 +1374,40 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { return Ret; \ } -__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin, ) -__SYCL_GROUP_COLLECTIVE_OVERLOAD(UMin, ) -__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMin, ) +// Now use all 4 macros together for each instruction +#define __SYCL_GROUP_COLLECTIVE_ALL(Instruction, GroupExt) \ + __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction, GroupExt) \ + __SYCL_GROUP_COLLECTIVE_FRAGMENT(Instruction, GroupExt) \ + __SYCL_GROUP_COLLECTIVE_CHUNK(Instruction, GroupExt) \ + __SYCL_GROUP_COLLECTIVE_TANGLE(Instruction, GroupExt) + +__SYCL_GROUP_COLLECTIVE_ALL(SMin, ) +__SYCL_GROUP_COLLECTIVE_ALL(UMin, ) +__SYCL_GROUP_COLLECTIVE_ALL(FMin, ) + +__SYCL_GROUP_COLLECTIVE_ALL(SMax, ) +__SYCL_GROUP_COLLECTIVE_ALL(UMax, ) +__SYCL_GROUP_COLLECTIVE_ALL(FMax, ) -__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMax, ) -__SYCL_GROUP_COLLECTIVE_OVERLOAD(UMax, ) -__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMax, ) +__SYCL_GROUP_COLLECTIVE_ALL(IAdd, ) +__SYCL_GROUP_COLLECTIVE_ALL(FAdd, ) -__SYCL_GROUP_COLLECTIVE_OVERLOAD(IAdd, ) -__SYCL_GROUP_COLLECTIVE_OVERLOAD(FAdd, ) +__SYCL_GROUP_COLLECTIVE_ALL(IMul, KHR) +__SYCL_GROUP_COLLECTIVE_ALL(FMul, KHR) +__SYCL_GROUP_COLLECTIVE_ALL(CMulINTEL, ) -__SYCL_GROUP_COLLECTIVE_OVERLOAD(IMul, KHR) -__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMul, KHR) -__SYCL_GROUP_COLLECTIVE_OVERLOAD(CMulINTEL, ) +__SYCL_GROUP_COLLECTIVE_ALL(BitwiseOr, KHR) +__SYCL_GROUP_COLLECTIVE_ALL(BitwiseXor, KHR) +__SYCL_GROUP_COLLECTIVE_ALL(BitwiseAnd, KHR) -__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseOr, KHR) -__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseXor, KHR) -__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseAnd, KHR) +__SYCL_GROUP_COLLECTIVE_ALL(LogicalAnd, KHR) +__SYCL_GROUP_COLLECTIVE_ALL(LogicalOr, KHR) -__SYCL_GROUP_COLLECTIVE_OVERLOAD(LogicalAnd, KHR) -__SYCL_GROUP_COLLECTIVE_OVERLOAD(LogicalOr, KHR) +#undef __SYCL_GROUP_COLLECTIVE_OVERLOAD +#undef __SYCL_GROUP_COLLECTIVE_FRAGMENT +#undef __SYCL_GROUP_COLLECTIVE_CHUNK +#undef __SYCL_GROUP_COLLECTIVE_TANGLE +#undef __SYCL_GROUP_COLLECTIVE_ALL } // namespace spirv } // namespace detail diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index fad5dda896ee7..b248c4595cf18 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -14,16 +14,11 @@ #include // for array #include // for size_t -#include // for tuple #include // for true_type, false_type namespace sycl { inline namespace _V1 { namespace detail { -template struct is_fixed_size_group : std::false_type {}; - -template -inline constexpr bool is_fixed_size_group_v = is_fixed_size_group::value; } // namespace detail template class group; From 84d96366dc13e9d908c9b5e48fb0984563b99ca8 Mon Sep 17 00:00:00 2001 From: AndreiZibrov Date: Wed, 2 Jul 2025 11:26:46 +0200 Subject: [PATCH 09/32] Update sycl/include/sycl/ext/oneapi/experimental/tangle.hpp Co-authored-by: John Pennycook --- sycl/include/sycl/ext/oneapi/experimental/tangle.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp index 1324e0612d56d..be2566f008c0b 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp @@ -148,8 +148,6 @@ entangle(ParentGroup parent) { // and so the group of work-items in converged control flow is implicit. // We store the mask here because it is required to calculate IDs, not // because it is required to construct the group. - - // mask is required to calculate IDs (not the group) sub_group_mask mask = sycl::ext::oneapi::group_ballot(parent, true); return tangle(mask); #elif defined(__NVPTX__) From 78208f89c7036814a1bf650b4b851df979734719 Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Wed, 2 Jul 2025 11:35:18 +0200 Subject: [PATCH 10/32] [SYCL] Fixed formatting after refactoring --- sycl/include/sycl/detail/spirv.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 4159af55efb12..0dcce5db2b71a 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -24,7 +24,7 @@ #include #endif -#include // sycl::detail::memcpy +#include // sycl::detail::memcpy #include namespace sycl { @@ -1318,7 +1318,7 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { /* TODO: add support for partitioning into more than two groups */ \ } -#define __SYCL_GROUP_COLLECTIVE_CHUNK(Instruction, GroupExt) \ +#define __SYCL_GROUP_COLLECTIVE_CHUNK(Instruction, GroupExt) \ template <__spv::GroupOperation Op, size_t ChunkSize, typename ParentGroup, \ typename T> \ inline T Group##Instruction( \ @@ -1375,10 +1375,10 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { } // Now use all 4 macros together for each instruction -#define __SYCL_GROUP_COLLECTIVE_ALL(Instruction, GroupExt) \ - __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction, GroupExt) \ - __SYCL_GROUP_COLLECTIVE_FRAGMENT(Instruction, GroupExt) \ - __SYCL_GROUP_COLLECTIVE_CHUNK(Instruction, GroupExt) \ +#define __SYCL_GROUP_COLLECTIVE_ALL(Instruction, GroupExt) \ + __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction, GroupExt) \ + __SYCL_GROUP_COLLECTIVE_FRAGMENT(Instruction, GroupExt) \ + __SYCL_GROUP_COLLECTIVE_CHUNK(Instruction, GroupExt) \ __SYCL_GROUP_COLLECTIVE_TANGLE(Instruction, GroupExt) __SYCL_GROUP_COLLECTIVE_ALL(SMin, ) From 63c2ec8fbca2301961aa7f865b45c745454937dd Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Wed, 2 Jul 2025 12:01:57 +0200 Subject: [PATCH 11/32] [SYCL] Remove left empty namespace + fix empty lines / comment formatting --- sycl/include/sycl/detail/spirv.hpp | 18 +++++++++--------- sycl/include/sycl/detail/type_traits.hpp | 4 +--- .../sycl/ext/oneapi/experimental/tangle.hpp | 3 +-- .../non-uniform-groups/is_user_constructed.cpp | 2 +- .../HasExtensionWordBoundary.cpp | 2 +- 5 files changed, 13 insertions(+), 16 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 0dcce5db2b71a..ddbc6ab3ed8b6 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -175,9 +175,9 @@ bool GroupAll(ext::oneapi::experimental::fragment g, bool pred) { // TODO: adding support for fragments have partitioning into more than two // groups such as labeled_partition: // - // 1. add size_t framgment::get_group_count() const definition - // 2. const auto group_count{g.get_group_count()}; - // 3. for (size_t i{}; i != group_count; ++i) + // 1. add size_t framgment::get_group_range() const definition + // 2. const auto group_range{g.get_group_range()}; + // 3. for (size_t i{}; i != group_range; ++i) // if (g.get_group_id() == i) // return __spirv_GroupNonUniformAll( // group_scope::value, @@ -215,9 +215,9 @@ bool GroupAny(ext::oneapi::experimental::fragment g, bool pred) { // TODO: adding support for fragments have partitioning into more than two // groups such as labeled_partition: // - // 1. add size_t framgment::get_group_count() const definition - // 2. const auto group_count{g.get_group_count()}; - // 3. for (size_t i{}; i != group_count; ++i) + // 1. add size_t framgment::get_group_range() const definition + // 2. const auto group_range{g.get_group_range()}; + // 3. for (size_t i{}; i != group_range; ++i) // if (g.get_group_id() == i) // return __spirv_GroupNonUniformAny( // group_scope::value, @@ -338,9 +338,9 @@ GroupBroadcast(sycl::ext::oneapi::experimental::fragment g, T x, // TODO: adding support for fragments have partitioning into more than two // groups such as labeled_partition: // - // 1. add size_t framgment::get_group_count() const definition - // 2. const auto group_count{g.get_group_count()}; - // 3. for (size_t i{}; i != group_count; ++i) + // 1. add size_t framgment::get_group_range() const definition + // 2. const auto group_range{g.get_group_range()}; + // 3. for (size_t i{}; i != group_range; ++i) // if (g.get_group_id() == i) // return __spirv_GroupNonUniformBroadcast( // group_scope::value, diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index b248c4595cf18..5dcbf6abcc140 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -18,8 +18,6 @@ namespace sycl { inline namespace _V1 { -namespace detail { -} // namespace detail template class group; struct sub_group; @@ -149,7 +147,7 @@ template struct get_elem_type_unqual> { template class OperationCurrentT, int... Indexes> struct get_elem_type_unqual> { + OperationCurrentT, Indexes...>> { using type = typename get_elem_type_unqual>::type; }; #else diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp index be2566f008c0b..1cf4daab9f5f9 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp @@ -1,5 +1,4 @@ -//==------------- tangle.hpp --- SYCL extension for non-uniform groups -//------==// +//==--------- tangle.hpp --- SYCL extension for non-uniform groups ---------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/sycl/test/non-uniform-groups/is_user_constructed.cpp b/sycl/test/non-uniform-groups/is_user_constructed.cpp index a3df70c4d18a8..c71e686049f1d 100644 --- a/sycl/test/non-uniform-groups/is_user_constructed.cpp +++ b/sycl/test/non-uniform-groups/is_user_constructed.cpp @@ -18,4 +18,4 @@ static_assert(is_user_constructed_group>); static_assert(is_user_constructed_group>); // sub_group itself is NOT user-constructed -static_assert(not is_user_constructed_group); \ No newline at end of file +static_assert(not is_user_constructed_group); diff --git a/sycl/unittests/context_device/HasExtensionWordBoundary.cpp b/sycl/unittests/context_device/HasExtensionWordBoundary.cpp index bad3ebfd47cd8..9879066445e79 100644 --- a/sycl/unittests/context_device/HasExtensionWordBoundary.cpp +++ b/sycl/unittests/context_device/HasExtensionWordBoundary.cpp @@ -122,4 +122,4 @@ TEST_F(HasExtensionWordBoundaryTest, NonUniformGroupExtensions) { EXPECT_FALSE(DevImpl->has_extension("cl_intel_subgroup")); // Missing 's' EXPECT_FALSE(DevImpl->has_extension("non_uniform_vote")); // Missing prefix EXPECT_FALSE(DevImpl->has_extension("subgroup_matrix_multiply")); -} \ No newline at end of file +} From 92bddb31a41ee7ca5f10b2153ed5ef11e1c03b4f Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Wed, 2 Jul 2025 13:04:50 +0200 Subject: [PATCH 12/32] [SYCL] Fix unreached leftovers didn't reach by tests recently --- .../cuda/non_uniform_algorithms.hpp | 28 +++++++-------- .../sycl/ext/oneapi/experimental/fragment.hpp | 2 +- sycl/source/detail/device_impl.hpp | 3 +- .../HasExtensionWordBoundary.cpp | 34 +++++++++++++++---- 4 files changed, 44 insertions(+), 23 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index 4b760cf637036..3b323229132f0 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -57,8 +57,8 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, } template -std::enable_if_t<(is_sugeninteger_v || - is_sigeninteger_v)&&IsPlus::value, +std::enable_if_t<(is_sugeninteger_v || is_sigeninteger_v) && + IsPlus::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -66,8 +66,8 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, } template -std::enable_if_t<(is_sugeninteger_v || - is_sigeninteger_v)&&IsBitAND::value, +std::enable_if_t<(is_sugeninteger_v || is_sigeninteger_v) && + IsBitAND::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -75,8 +75,8 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, } template -std::enable_if_t<(is_sugeninteger_v || - is_sigeninteger_v)&&IsBitOR::value, +std::enable_if_t<(is_sugeninteger_v || is_sigeninteger_v) && + IsBitOR::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -84,8 +84,8 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, } template -std::enable_if_t<(is_sugeninteger_v || - is_sigeninteger_v)&&IsBitXOR::value, +std::enable_if_t<(is_sugeninteger_v || is_sigeninteger_v) && + IsBitXOR::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -95,9 +95,9 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, //// Shuffle based masked reduction impls -// fixed_size_group group reduction using shfls +// chunk group reduction using shfls template -inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> +inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { @@ -111,7 +111,7 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, template inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v && - !is_fixed_size_group_v, + !is_chunk_v, T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -208,10 +208,10 @@ inline __SYCL_ALWAYS_INLINE //// Shuffle based masked reduction impls -// fixed_size_group group scan using shfls +// chunk group scan using shfls template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> -inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> +inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { unsigned localIdVal = g.get_local_id()[0]; @@ -233,7 +233,7 @@ template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v && - !is_fixed_size_group_v, + !is_chunk_v, T> masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { diff --git a/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp b/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp index dbe74add3a021..fc0dca7b2a395 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp @@ -174,7 +174,7 @@ binary_partition(ParentGroup parent, bool predicate) { #endif #else (void)predicate; - return fragment(); + return fragment(sub_group_mask(), id<1>(0), range<1>(1)); #endif } diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 565cbb150f5e4..e52c61d83fb04 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -1445,8 +1445,7 @@ class device_impl : public std::enable_shared_from_this { } CASE(ext_oneapi_tangle) { return (this->getBackend() == backend::ext_oneapi_level_zero) || - (this->getBackend() == backend::opencl) || - (this->getBackend() == backend::ext_oneapi_cuda); + (this->getBackend() == backend::opencl); } CASE(ext_intel_matrix) { using arch = sycl::ext::oneapi::experimental::architecture; diff --git a/sycl/unittests/context_device/HasExtensionWordBoundary.cpp b/sycl/unittests/context_device/HasExtensionWordBoundary.cpp index 9879066445e79..fec0ed1d37bc9 100644 --- a/sycl/unittests/context_device/HasExtensionWordBoundary.cpp +++ b/sycl/unittests/context_device/HasExtensionWordBoundary.cpp @@ -43,7 +43,7 @@ static ur_result_t redefinedDeviceGetInfo(void *pParams) { class HasExtensionWordBoundaryTest : public ::testing::Test { public: - HasExtensionWordBoundaryTest() : Mock{}, Plt{sycl::platform()} {} + HasExtensionWordBoundaryTest() : Mock{} {} protected: void SetUp() override { @@ -57,13 +57,12 @@ class HasExtensionWordBoundaryTest : public ::testing::Test { } sycl::unittest::UrMock<> Mock; - sycl::platform Plt; }; TEST_F(HasExtensionWordBoundaryTest, ExactMatchWorks) { MockExtensions = "cl_khr_fp64 cl_intel_subgroups cl_khr_subgroups"; - Plt = sycl::platform(); - + + sycl::platform Plt{sycl::platform()}; sycl::device Dev = Plt.get_devices()[0]; auto DevImpl = detail::getSyclObjImpl(Dev); @@ -74,6 +73,8 @@ TEST_F(HasExtensionWordBoundaryTest, ExactMatchWorks) { TEST_F(HasExtensionWordBoundaryTest, SubstringDoesNotMatch) { MockExtensions = "cl_intel_subgroups cl_khr_fp64_extended"; + + sycl::platform Plt{sycl::platform()}; sycl::device Dev = Plt.get_devices()[0]; auto DevImpl = detail::getSyclObjImpl(Dev); @@ -86,15 +87,34 @@ TEST_F(HasExtensionWordBoundaryTest, SubstringDoesNotMatch) { EXPECT_FALSE(DevImpl->has_extension("intel_subgroups")); // Partial match } -TEST_F(HasExtensionWordBoundaryTest, EdgeCases) { +TEST_F(HasExtensionWordBoundaryTest, EmptyExtensions) { + MockExtensions = ""; + + sycl::platform Plt{sycl::platform()}; sycl::device Dev = Plt.get_devices()[0]; auto DevImpl = detail::getSyclObjImpl(Dev); - MockExtensions = ""; + EXPECT_FALSE(DevImpl->has_extension("cl_khr_fp64")); +} + +TEST_F(HasExtensionWordBoundaryTest, SingleExtension) { MockExtensions = "cl_khr_fp64"; + + sycl::platform Plt{sycl::platform()}; + sycl::device Dev = Plt.get_devices()[0]; + auto DevImpl = detail::getSyclObjImpl(Dev); + EXPECT_TRUE(DevImpl->has_extension("cl_khr_fp64")); EXPECT_FALSE(DevImpl->has_extension("cl_khr_fp6")); // Substring +} + +TEST_F(HasExtensionWordBoundaryTest, FirstMiddleLastExtensions) { MockExtensions = "cl_first_ext cl_middle_ext cl_last_ext"; + + sycl::platform Plt{sycl::platform()}; + sycl::device Dev = Plt.get_devices()[0]; + auto DevImpl = detail::getSyclObjImpl(Dev); + EXPECT_TRUE(DevImpl->has_extension("cl_first_ext")); EXPECT_TRUE(DevImpl->has_extension("cl_middle_ext")); EXPECT_TRUE(DevImpl->has_extension("cl_last_ext")); @@ -106,6 +126,8 @@ TEST_F(HasExtensionWordBoundaryTest, NonUniformGroupExtensions) { "cl_intel_subgroups " "cl_intel_spirv_subgroups " "cl_intel_subgroup_matrix_multiply_accumulate"; + + sycl::platform Plt{sycl::platform()}; sycl::device Dev = Plt.get_devices()[0]; auto DevImpl = detail::getSyclObjImpl(Dev); From 6ba16c9a06df20bf4377a9851350e4f6755a1e9f Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Wed, 2 Jul 2025 14:05:49 +0200 Subject: [PATCH 13/32] [SYCL] move has_extension fix to separate PR --- sycl/source/detail/device_impl.cpp | 3 +- sycl/unittests/context_device/CMakeLists.txt | 1 - .../HasExtensionWordBoundary.cpp | 147 ------------------ 3 files changed, 1 insertion(+), 150 deletions(-) delete mode 100644 sycl/unittests/context_device/HasExtensionWordBoundary.cpp diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index e9988510704df..a7729d19ce3e2 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -109,8 +109,7 @@ device_impl::get_backend_info() const { bool device_impl::has_extension(const std::string &ExtensionName) const { std::string AllExtensionNames = get_info_impl(); - return ((" " + AllExtensionNames + " ").find(" " + ExtensionName + " ") != - std::string::npos); + return (AllExtensionNames.find(ExtensionName) != std::string::npos); } bool device_impl::is_partition_supported(info::partition_property Prop) const { diff --git a/sycl/unittests/context_device/CMakeLists.txt b/sycl/unittests/context_device/CMakeLists.txt index ff435ec68a8d7..02488fae878bd 100644 --- a/sycl/unittests/context_device/CMakeLists.txt +++ b/sycl/unittests/context_device/CMakeLists.txt @@ -1,5 +1,4 @@ add_sycl_unittest(ContextDeviceTests OBJECT Context.cpp DeviceRefCounter.cpp - HasExtensionWordBoundary.cpp ) diff --git a/sycl/unittests/context_device/HasExtensionWordBoundary.cpp b/sycl/unittests/context_device/HasExtensionWordBoundary.cpp deleted file mode 100644 index fec0ed1d37bc9..0000000000000 --- a/sycl/unittests/context_device/HasExtensionWordBoundary.cpp +++ /dev/null @@ -1,147 +0,0 @@ -//==---- HasExtensionWordBoundary.cpp --- Test word boundary fix ----------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This test verifies that has_extension correctly matches full extension names -// and doesn't match partial substrings. -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include - -using namespace sycl; - -static std::string MockExtensions = ""; - -static ur_result_t redefinedDeviceGetInfo(void *pParams) { - auto params = *static_cast(pParams); - - if (*params.ppropName == UR_DEVICE_INFO_EXTENSIONS) { - // override extensions query with mock data - if (*params.ppPropValue) { - size_t len = MockExtensions.length() + 1; - if (*params.ppropSize >= len) - std::memcpy(*params.ppPropValue, MockExtensions.c_str(), len); - } - if (*params.ppPropSizeRet) - **params.ppPropSizeRet = MockExtensions.length() + 1; - - return UR_RESULT_SUCCESS; - } - - // delegate to the default mock - return sycl::unittest::MockAdapter::mock_urDeviceGetInfo(pParams); -} - -class HasExtensionWordBoundaryTest : public ::testing::Test { -public: - HasExtensionWordBoundaryTest() : Mock{} {} - -protected: - void SetUp() override { - mock::getCallbacks().set_replace_callback("urDeviceGetInfo", - &redefinedDeviceGetInfo); - } - - void TearDown() override { - // nullptr to use default mock - mock::getCallbacks().set_replace_callback("urDeviceGetInfo", nullptr); - } - - sycl::unittest::UrMock<> Mock; -}; - -TEST_F(HasExtensionWordBoundaryTest, ExactMatchWorks) { - MockExtensions = "cl_khr_fp64 cl_intel_subgroups cl_khr_subgroups"; - - sycl::platform Plt{sycl::platform()}; - sycl::device Dev = Plt.get_devices()[0]; - auto DevImpl = detail::getSyclObjImpl(Dev); - - EXPECT_TRUE(DevImpl->has_extension("cl_khr_fp64")); - EXPECT_TRUE(DevImpl->has_extension("cl_intel_subgroups")); - EXPECT_TRUE(DevImpl->has_extension("cl_khr_subgroups")); -} - -TEST_F(HasExtensionWordBoundaryTest, SubstringDoesNotMatch) { - MockExtensions = "cl_intel_subgroups cl_khr_fp64_extended"; - - sycl::platform Plt{sycl::platform()}; - sycl::device Dev = Plt.get_devices()[0]; - auto DevImpl = detail::getSyclObjImpl(Dev); - - // These should NOT match because they're substrings - EXPECT_FALSE(DevImpl->has_extension("cl_intel_subgroup")); // Missing 's' - - // Would match in old implementation - EXPECT_FALSE(DevImpl->has_extension("cl_khr_fp64")); - EXPECT_FALSE(DevImpl->has_extension("subgroups")); // Partial match - EXPECT_FALSE(DevImpl->has_extension("intel_subgroups")); // Partial match -} - -TEST_F(HasExtensionWordBoundaryTest, EmptyExtensions) { - MockExtensions = ""; - - sycl::platform Plt{sycl::platform()}; - sycl::device Dev = Plt.get_devices()[0]; - auto DevImpl = detail::getSyclObjImpl(Dev); - - EXPECT_FALSE(DevImpl->has_extension("cl_khr_fp64")); -} - -TEST_F(HasExtensionWordBoundaryTest, SingleExtension) { - MockExtensions = "cl_khr_fp64"; - - sycl::platform Plt{sycl::platform()}; - sycl::device Dev = Plt.get_devices()[0]; - auto DevImpl = detail::getSyclObjImpl(Dev); - - EXPECT_TRUE(DevImpl->has_extension("cl_khr_fp64")); - EXPECT_FALSE(DevImpl->has_extension("cl_khr_fp6")); // Substring -} - -TEST_F(HasExtensionWordBoundaryTest, FirstMiddleLastExtensions) { - MockExtensions = "cl_first_ext cl_middle_ext cl_last_ext"; - - sycl::platform Plt{sycl::platform()}; - sycl::device Dev = Plt.get_devices()[0]; - auto DevImpl = detail::getSyclObjImpl(Dev); - - EXPECT_TRUE(DevImpl->has_extension("cl_first_ext")); - EXPECT_TRUE(DevImpl->has_extension("cl_middle_ext")); - EXPECT_TRUE(DevImpl->has_extension("cl_last_ext")); -} - -TEST_F(HasExtensionWordBoundaryTest, NonUniformGroupExtensions) { - MockExtensions = "cl_khr_subgroup_non_uniform_vote " - "cl_khr_subgroup_ballot " - "cl_intel_subgroups " - "cl_intel_spirv_subgroups " - "cl_intel_subgroup_matrix_multiply_accumulate"; - - sycl::platform Plt{sycl::platform()}; - sycl::device Dev = Plt.get_devices()[0]; - auto DevImpl = detail::getSyclObjImpl(Dev); - - // should match (real extensions) - EXPECT_TRUE(DevImpl->has_extension("cl_khr_subgroup_non_uniform_vote")); - EXPECT_TRUE(DevImpl->has_extension("cl_khr_subgroup_ballot")); - EXPECT_TRUE(DevImpl->has_extension("cl_intel_subgroups")); - EXPECT_TRUE(DevImpl->has_extension("cl_intel_spirv_subgroups")); - EXPECT_TRUE( - DevImpl->has_extension("cl_intel_subgroup_matrix_multiply_accumulate")); - - // next should NOT match (substrings that would match with old impl.) - EXPECT_FALSE(DevImpl->has_extension("cl_khr_subgroup")); - EXPECT_FALSE(DevImpl->has_extension("cl_intel_subgroup")); // Missing 's' - EXPECT_FALSE(DevImpl->has_extension("non_uniform_vote")); // Missing prefix - EXPECT_FALSE(DevImpl->has_extension("subgroup_matrix_multiply")); -} From 0d3da76cd65808dc21cd59686bf514039d8e5f9a Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Wed, 2 Jul 2025 14:39:29 +0200 Subject: [PATCH 14/32] [SYCL] Fix missing include --- .../sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index 3b323229132f0..da88ee3a0e811 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -10,6 +10,7 @@ #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) #include "masked_shuffles.hpp" +#include "../non_uniform_groups.hpp" namespace sycl { inline namespace _V1 { From c5a48c96be86df17155a524e5e5e158b6e3497be Mon Sep 17 00:00:00 2001 From: AndreiZibrov Date: Wed, 2 Jul 2025 14:52:37 +0200 Subject: [PATCH 15/32] [SYCL] bringing back missed comment --- sycl/source/detail/device_impl.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index e52c61d83fb04..a2129e57e8450 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -1444,6 +1444,8 @@ class device_impl : public std::enable_shared_from_this { (this->getBackend() == backend::ext_oneapi_cuda); } CASE(ext_oneapi_tangle) { + // TODO: tangle_group is not currently supported for CUDA devices. Add + // when implemented. return (this->getBackend() == backend::ext_oneapi_level_zero) || (this->getBackend() == backend::opencl); } From 4fbb2b81af066f98de5b54ca1681cf17fd7fb3d0 Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Wed, 2 Jul 2025 15:13:36 +0200 Subject: [PATCH 16/32] [SYCL] Formatting missed include line --- .../ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index da88ee3a0e811..c4e46e2e56691 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -9,8 +9,8 @@ #pragma once #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) -#include "masked_shuffles.hpp" #include "../non_uniform_groups.hpp" +#include "masked_shuffles.hpp" namespace sycl { inline namespace _V1 { From c4f9c371307064346494875f824f46f62a10a5be Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Wed, 2 Jul 2025 16:40:36 +0200 Subject: [PATCH 17/32] [SYCL] Fix include missing by CI --- .../ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index c4e46e2e56691..5142ae0710c24 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -9,8 +9,8 @@ #pragma once #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) -#include "../non_uniform_groups.hpp" #include "masked_shuffles.hpp" +#include namespace sycl { inline namespace _V1 { From 502ac668982cc8eab92f62cb619f98e182b44cb6 Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Wed, 2 Jul 2025 17:17:58 +0200 Subject: [PATCH 18/32] [SYCL] Fix missing namespace for is_chunk_v trait --- .../experimental/cuda/non_uniform_algorithms.hpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index 5142ae0710c24..01280f175583c 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -16,6 +16,8 @@ namespace sycl { inline namespace _V1 { namespace detail { +using syclex = ext::oneapi::experimental; + template using IsRedux = std::bool_constant< std::is_integral::value && IsBitAND::value || @@ -98,7 +100,7 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, // chunk group reduction using shfls template -inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> +inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { @@ -111,9 +113,7 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, // Opportunistic/Ballot group reduction using shfls template inline __SYCL_ALWAYS_INLINE std::enable_if_t< - ext::oneapi::experimental::is_user_constructed_group_v && - !is_chunk_v, - T> + syclex::is_user_constructed_group_v && !syclex::is_chunk_v, T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -159,7 +159,7 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, template std::enable_if_t< std::is_same, std::false_type>::value && - ext::oneapi::experimental::is_user_constructed_group_v, + syclex::is_user_constructed_group_v, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -212,7 +212,7 @@ inline __SYCL_ALWAYS_INLINE // chunk group scan using shfls template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> -inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> +inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { unsigned localIdVal = g.get_local_id()[0]; @@ -233,9 +233,7 @@ masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> inline __SYCL_ALWAYS_INLINE std::enable_if_t< - ext::oneapi::experimental::is_user_constructed_group_v && - !is_chunk_v, - T> + syclex::is_user_constructed_group_v && !syclex::is_chunk_v, T> masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { unsigned localIdVal = g.get_local_id()[0]; From 7639a7b657c7214dddd5b2ea519542031b3bc4fe Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Wed, 2 Jul 2025 20:29:32 +0200 Subject: [PATCH 19/32] [SYCL] Fixed wrong namespace ref --- .../ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index 01280f175583c..d1842fa41fb03 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -16,7 +16,7 @@ namespace sycl { inline namespace _V1 { namespace detail { -using syclex = ext::oneapi::experimental; +using syclex = ::sycl::ext::oneapi::experimental; template using IsRedux = std::bool_constant< From de45c927810b2fbf095b5acc647f3203962f71a6 Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Thu, 24 Jul 2025 16:22:11 +0200 Subject: [PATCH 20/32] Update sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp --- .../ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index d1842fa41fb03..edad2a7ee063c 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -16,7 +16,7 @@ namespace sycl { inline namespace _V1 { namespace detail { -using syclex = ::sycl::ext::oneapi::experimental; +namespace syclex = sycl::ext::oneapi::experimental; template using IsRedux = std::bool_constant< From 62082cee2738bc0918d70d716aeb7dfc965bf961 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Thu, 24 Jul 2025 08:28:04 -0700 Subject: [PATCH 21/32] Remove redundant test requirements Signed-off-by: Larsen, Steffen --- sycl/test-e2e/NonUniformGroups/tangle.cpp | 1 - sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp | 6 ------ .../e2e_test_requirements/no-unsupported-without-info.cpp | 7 +------ 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/sycl/test-e2e/NonUniformGroups/tangle.cpp b/sycl/test-e2e/NonUniformGroups/tangle.cpp index 2736ddbc02024..cd7598685ed1d 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle.cpp @@ -8,7 +8,6 @@ // // REQUIRES: cpu || gpu // REQUIRES: aspect-ext_oneapi_tangle -// UNSUPPORTED: target-nvidia || target-amd #include #include diff --git a/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp index 117c45ba34282..02decafb0734a 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp @@ -6,14 +6,8 @@ // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -fno-sycl-early-optimizations -o %t.x86.out %s %} // RUN: %if cpu %{ %{run} %t.x86.out %} // -// REQUIRES: cpu || gpu // REQUIRES: sg-32 // REQUIRES: aspect-ext_oneapi_tangle -// UNSUPPORTED: target-amd -// UNSUPPORTED-INTENDED: tangle groups not available on amd -// UNSUPPORTED: target-nvidia || windows -// Tangle groups exhibit unpredictable behavior on Windows. -// The test is disabled while we investigate the root cause. #include #include diff --git a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp index 5038b480245a4..5eea752158c46 100644 --- a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp +++ b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp @@ -54,7 +54,7 @@ // tests to match the required format and in that case you should just update // (i.e. reduce) the number and the list below. // -// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 183 +// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 178 // // List of improperly UNSUPPORTED tests. // Remove the CHECK once the test has been properly UNSUPPORTED. @@ -193,11 +193,6 @@ // CHECK-NEXT: Matrix/joint_matrix_bf16_fill_k_cache_OOB.cpp // CHECK-NEXT: Matrix/joint_matrix_down_convert.cpp // CHECK-NEXT: Matrix/joint_matrix_rowmajorA_rowmajorB.cpp -// CHECK-NEXT: NonUniformGroups/chunk.cpp -// CHECK-NEXT: NonUniformGroups/fragment.cpp -// CHECK-NEXT: NonUniformGroups/opportunistic.cpp -// CHECK-NEXT: NonUniformGroups/tangle.cpp -// CHECK-NEXT: NonUniformGroups/tangle_algorithms.cpp // CHECK-NEXT: OptionalKernelFeatures/large-reqd-work-group-size.cpp // CHECK-NEXT: OptionalKernelFeatures/no-fp64-optimization-declared-aspects.cpp // CHECK-NEXT: Printf/char.cpp From 251f0b684ad2801d50d83ae4db35461b1eac98e1 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Thu, 24 Jul 2025 08:29:11 -0700 Subject: [PATCH 22/32] Remove unintentional tangle from CUDA min aspect requirements Signed-off-by: Larsen, Steffen --- llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index 72d1d009a851f..dbfffb5d490eb 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -283,7 +283,7 @@ defvar CudaMinAspects = !listconcat(CudaMinUSMAspects, [AspectGpu, AspectFp64, A AspectQueue_profiling, AspectExt_intel_pci_address, AspectExt_intel_memory_bus_width, AspectExt_intel_device_info_uuid, AspectExt_oneapi_native_assert, AspectExt_intel_free_memory, AspectExt_intel_device_id, AspectExt_intel_memory_clock_rate, AspectExt_oneapi_fragment, AspectExt_oneapi_chunk, - AspectExt_oneapi_tangle, AspectExt_oneapi_graph, AspectExt_oneapi_limited_graph]); + AspectExt_oneapi_graph, AspectExt_oneapi_limited_graph]); // Bindless images aspects are partially supported on CUDA and disabled by default at the moment. defvar CudaBindlessImagesAspects = [AspectExt_oneapi_bindless_images, AspectExt_oneapi_bindless_images_shared_usm, AspectExt_oneapi_bindless_images_1d_usm, AspectExt_oneapi_bindless_images_2d_usm, AspectExt_oneapi_external_memory_import, From a7ec62261a3d1ed48d64ab7996c81bd65a5da62b Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Thu, 24 Jul 2025 09:02:30 -0700 Subject: [PATCH 23/32] Remove unnecessary alias Signed-off-by: Larsen, Steffen --- .../cuda/non_uniform_algorithms.hpp | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index edad2a7ee063c..dca4623ed2bb4 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -16,8 +16,6 @@ namespace sycl { inline namespace _V1 { namespace detail { -namespace syclex = sycl::ext::oneapi::experimental; - template using IsRedux = std::bool_constant< std::is_integral::value && IsBitAND::value || @@ -100,9 +98,10 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, // chunk group reduction using shfls template -inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> -masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) { +inline __SYCL_ALWAYS_INLINE + std::enable_if_t, T> + masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { T tmp = cuda_shfl_sync_bfly_i32(MemberMask, x, i, 0x1f); x = binary_op(x, tmp); @@ -113,7 +112,9 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, // Opportunistic/Ballot group reduction using shfls template inline __SYCL_ALWAYS_INLINE std::enable_if_t< - syclex::is_user_constructed_group_v && !syclex::is_chunk_v, T> + ext::oneapi::experimental::is_user_constructed_group_v && + !ext::oneapi::experimental::is_chunk_v, + T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -159,7 +160,7 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, template std::enable_if_t< std::is_same, std::false_type>::value && - syclex::is_user_constructed_group_v, + ext::oneapi::experimental::is_user_constructed_group_v, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -212,9 +213,10 @@ inline __SYCL_ALWAYS_INLINE // chunk group scan using shfls template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> -inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> -masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) { +inline __SYCL_ALWAYS_INLINE + std::enable_if_t, T> + masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { unsigned localIdVal = g.get_local_id()[0]; for (int i = 1; i < g.get_local_range()[0]; i *= 2) { T tmp = cuda_shfl_sync_up_i32(MemberMask, x, i, 0); @@ -233,7 +235,9 @@ masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> inline __SYCL_ALWAYS_INLINE std::enable_if_t< - syclex::is_user_constructed_group_v && !syclex::is_chunk_v, T> + ext::oneapi::experimental::is_user_constructed_group_v && + !ext::oneapi::experimental::is_chunk_v, + T> masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { unsigned localIdVal = g.get_local_id()[0]; From 1fbf327f1303108eee782df8eff5940cd729e4eb Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Thu, 24 Jul 2025 09:06:45 -0700 Subject: [PATCH 24/32] Add SYCL_EXT_ONEAPI_TANGLE Signed-off-by: Larsen, Steffen --- sycl/source/feature_test.hpp.in | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/source/feature_test.hpp.in b/sycl/source/feature_test.hpp.in index d407aab399bc8..1e26cf0b8a23e 100644 --- a/sycl/source/feature_test.hpp.in +++ b/sycl/source/feature_test.hpp.in @@ -123,6 +123,7 @@ inline namespace _V1 { #define SYCL_EXT_ONEAPI_ATOMIC16 0 #define SYCL_KHR_DEFAULT_CONTEXT 1 #define SYCL_EXT_INTEL_EVENT_MODE 1 +#define SYCL_EXT_ONEAPI_TANGLE 1 // Unfinished KHR extensions. These extensions are only available if the // __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS macro is defined. From f7e61843b4a24e504a5e57aefd0f3f12a9d92c68 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Fri, 25 Jul 2025 03:23:18 -0700 Subject: [PATCH 25/32] Add additional partition group support Signed-off-by: Larsen, Steffen --- .../sycl/ext/oneapi/experimental/chunk.hpp | 64 ++++++---- .../sycl/ext/oneapi/experimental/fragment.hpp | 32 ++--- .../experimental/non_uniform_groups.hpp | 17 +-- .../sycl/ext/oneapi/experimental/tangle.hpp | 10 +- .../sycl/ext/oneapi/sub_group_mask.hpp | 65 ++++++---- sycl/test-e2e/NonUniformGroups/chunk.cpp | 114 ++++++++++++++++-- .../NonUniformGroups/chunk_algorithms.cpp | 5 +- sycl/test-e2e/NonUniformGroups/fragment.cpp | 111 ++++++++++++++++- .../NonUniformGroups/fragment_algorithms.cpp | 3 +- .../NonUniformGroups/opportunistic.cpp | 5 +- .../opportunistic_algorithms.cpp | 3 +- sycl/test-e2e/NonUniformGroups/tangle.cpp | 3 +- .../NonUniformGroups/tangle_algorithms.cpp | 3 +- 13 files changed, 327 insertions(+), 108 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp index 46274ad33d8e9..96e92b89f05db 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp @@ -34,8 +34,8 @@ template #ifdef __SYCL_DEVICE_ONLY__ [[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_chunk)]] #endif -inline std::enable_if_t> && - std::is_same_v, +inline std::enable_if_t || + is_chunk_v, chunk> chunked_partition(ParentGroup parent); @@ -51,7 +51,11 @@ template class chunk { id_type get_group_id() const { #ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupLocalInvocationId() / ChunkSize; + if constexpr (is_chunk_v) + return __spirv_SubgroupLocalInvocationId() % ParentGroup::chunk_size / + ChunkSize; + else + return __spirv_SubgroupLocalInvocationId() / ChunkSize; #else return id_type(0); #endif @@ -67,7 +71,10 @@ template class chunk { range_type get_group_range() const { #ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupSize() / ChunkSize; + if constexpr (is_chunk_v) + return ParentGroup::chunk_size / ChunkSize; + else + return __spirv_SubgroupSize() / ChunkSize; #else return range_type(0); #endif @@ -122,14 +129,29 @@ template class chunk { } protected: + static constexpr size_t chunk_size = ChunkSize; + #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) sub_group_mask Mask; -#endif -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) chunk(ext::oneapi::sub_group_mask mask) : Mask(mask) {} + + ext::oneapi::sub_group_mask getMask() const { return Mask; } #else chunk() {} + +#ifdef __SYCL_DEVICE_ONLY__ + ext::oneapi::sub_group_mask getMask() const { + ext::oneapi::sub_group_mask::BitsType MaskBits{0}; + MaskBits = ~MaskBits; + MaskBits >>= ext::oneapi::sub_group_mask::max_bits - ChunkSize; + MaskBits <<= + ext::oneapi::sub_group_mask::max_bits - + (((__spirv_SubgroupLocalInvocationId() / ChunkSize) + 1) * ChunkSize); + return sycl::detail::Builder::createSubGroupMask< + ext::oneapi::sub_group_mask>(MaskBits, __spirv_SubgroupMaxSize()); + } +#endif #endif friend chunk @@ -137,15 +159,19 @@ template class chunk { friend sub_group_mask sycl::detail::GetMask>( chunk Group); + + template + friend class chunk; }; // Chunked partition implementation template -inline std::enable_if_t> && - std::is_same_v, +inline std::enable_if_t || + is_chunk_v, chunk> -chunked_partition(ParentGroup parent) { - (void)parent; +chunked_partition([[maybe_unused]] ParentGroup parent) { + static_assert((ChunkSize & (ChunkSize - size_t{1})) == size_t{0}, + "ChunkSize must be a power of 2."); #ifdef __SYCL_DEVICE_ONLY__ // sync all work-items in parent group before partitioning sycl::group_barrier(parent); @@ -176,21 +202,8 @@ struct is_user_constructed_group> template struct is_chunk> : std::true_type {}; -} // namespace ext::oneapi::experimental - -template -struct is_group> - : std::true_type {}; - -} // namespace _V1 -} // namespace sycl - // chunk->fragment conversion // must be defined after fragment class is available -namespace sycl { -inline namespace _V1 { -namespace ext::oneapi::experimental { - template inline chunk::operator fragment() const { #ifdef __SYCL_DEVICE_ONLY__ @@ -216,5 +229,10 @@ inline chunk::operator fragment() const { } } // namespace ext::oneapi::experimental + +template +struct is_group> + : std::true_type {}; + } // namespace _V1 } // namespace sycl \ No newline at end of file diff --git a/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp b/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp index fc0dca7b2a395..19de9086f74f5 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp @@ -38,8 +38,9 @@ template #ifdef __SYCL_DEVICE_ONLY__ [[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_fragment)]] #endif -inline std::enable_if_t> && - std::is_same_v, +inline std::enable_if_t || + is_chunk_v || + is_fragment_v, fragment> binary_partition(ParentGroup parent, bool predicate); @@ -139,6 +140,8 @@ template class fragment { fragment(sub_group_mask m, id_type group_id, range_type group_range) : Mask(m), GroupID(group_id), GroupRange(group_range) {} + ext::oneapi::sub_group_mask getMask() const { return Mask; } + friend fragment binary_partition(ParentGroup parent, bool predicate); @@ -149,31 +152,28 @@ template class fragment { }; template -inline std::enable_if_t> && - std::is_same_v, +inline std::enable_if_t || + is_chunk_v || + is_fragment_v, fragment> -binary_partition(ParentGroup parent, bool predicate) { - (void)parent; +binary_partition([[maybe_unused]] ParentGroup parent, + [[maybe_unused]] bool predicate) { #ifdef __SYCL_DEVICE_ONLY__ // sync all work-items in parent group before partitioning sycl::group_barrier(parent); #if defined(__SPIR__) || defined(__SPIRV__) || defined(__NVPTX__) - sub_group_mask mask = sycl::ext::oneapi::group_ballot(parent, predicate); + // Collect ballot results. If we are in the false predicate group, the result + // should be inverted and filtered for the participants of the parent. + sub_group_mask mask = sycl::detail::commonGroupBallotImpl(parent, predicate); + if (!predicate) + mask = (~mask) & sycl::detail::GetMask(parent); + id<1> group_id = predicate ? 1 : 0; range<1> group_range = 2; // 2 groups based on predicate by binary_partition - - if (!predicate) { - sub_group_mask::BitsType participant_filter = - (~sub_group_mask::BitsType{0}) >> - (sub_group_mask::max_bits - parent.get_local_linear_range()); - mask = (~mask) & participant_filter; - } - return fragment(mask, group_id, group_range); #endif #else - (void)predicate; return fragment(sub_group_mask(), id<1>(0), range<1>(1)); #endif } diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index 98aa0dab2a071..e9fb839a77644 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -8,13 +8,13 @@ #pragma once -#include // for sub_group_mask -#include // for marray -#include // for vec +#include +#include +#include -#include // for size_t -#include // for uint32_t -#include // for false_type +#include +#include +#include namespace sycl { inline namespace _V1 { @@ -41,11 +41,6 @@ inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) { } #endif -template -inline ext::oneapi::sub_group_mask GetMask(NonUniformGroup Group) { - return Group.Mask; -} - template inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) { sycl::vec MemberMask = ExtractMask(GetMask(Group)); diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp index 1cf4daab9f5f9..7856391004a14 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp @@ -30,8 +30,7 @@ template #ifdef __SYCL_DEVICE_ONLY__ [[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_tangle)]] #endif -inline std::enable_if_t> && - std::is_same_v, +inline std::enable_if_t, tangle> entangle(ParentGroup parent); template class tangle { @@ -118,10 +117,10 @@ template class tangle { protected: #ifdef __SYCL_DEVICE_ONLY__ sub_group_mask Mask; -#endif -#ifdef __SYCL_DEVICE_ONLY__ tangle(ext::oneapi::sub_group_mask mask) : Mask(mask) {} + + ext::oneapi::sub_group_mask getMask() const { return Mask; } #else tangle() {} #endif @@ -133,8 +132,7 @@ template class tangle { }; template -inline std::enable_if_t> && - std::is_same_v, +inline std::enable_if_t, tangle> entangle(ParentGroup parent) { (void)parent; diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index 63e56327168e8..2c5c0174bcdd1 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -13,7 +13,8 @@ #include // for SYCL_EXT_ONEAPI_SUB_GROUP_MASK #include // for id #include // for marray -#include // for vec +#include +#include // for vec #include // for assert #include // for CHAR_BIT @@ -35,9 +36,6 @@ template struct group_scope; } // namespace detail -// forward decalre sycl::sub_group -struct sub_group; - namespace ext::oneapi { // forward decalre sycl::ext::oneapi::sub_group @@ -94,7 +92,7 @@ struct sub_group_mask { }; #if SYCL_EXT_ONEAPI_SUB_GROUP_MASK >= 2 - sub_group_mask() : sub_group_mask(0, GetMaxLocalRangeSize()){}; + sub_group_mask() : sub_group_mask(0, GetMaxLocalRangeSize()) {}; sub_group_mask(unsigned long long val) : sub_group_mask(0, GetMaxLocalRangeSize()) { @@ -111,7 +109,7 @@ struct sub_group_mask { size_t BytesToCopy = RemainingBytes < sizeof(T) ? RemainingBytes : sizeof(T); sycl::detail::memcpy_no_adl(reinterpret_cast(&Bits) + BytesCopied, - &val[I], BytesToCopy); + &val[I], BytesToCopy); BytesCopied += BytesToCopy; } } @@ -283,11 +281,6 @@ struct sub_group_mask { return Tmp; } - template - friend std::enable_if_t, sub_group>, - sub_group_mask> - group_ballot(Group g, bool predicate); - friend sub_group_mask operator&(const sub_group_mask &lhs, const sub_group_mask &rhs) { auto Res = lhs; @@ -338,22 +331,52 @@ struct sub_group_mask { size_t bits_num; }; +} // namespace ext::oneapi + +namespace detail { +template +ext::oneapi::sub_group_mask GetMask(NonUniformGroup Group) { + return Group.getMask(); +} + +template <> +ext::oneapi::sub_group_mask GetMask(sycl::sub_group Group) { + return (~ext::oneapi::sub_group_mask::BitsType{0}) >> + (ext::oneapi::sub_group_mask::max_bits - + Group.get_local_linear_range()); +} + +#ifdef __SYCL_DEVICE_ONLY__ +template +ext::oneapi::sub_group_mask commonGroupBallotImpl(Group G, bool Predicate) { + auto Res = __spirv_GroupNonUniformBallot( + sycl::detail::spirv::group_scope::value, Predicate); + ext::oneapi::sub_group_mask::BitsType Val = Res[0]; + if constexpr (sizeof(ext::oneapi::sub_group_mask::BitsType) == 8) + Val |= ((ext::oneapi::sub_group_mask::BitsType)Res[1]) << 32; + auto Mask = + sycl::detail::Builder::createSubGroupMask( + Val, __spirv_SubgroupMaxSize()); + // For sub-groups we do not need to apply the mask, but for others it will + // split converging groups accordingly. + if constexpr (!std::is_same_v, ext::oneapi::sub_group> && + !std::is_same_v, sycl::sub_group>) + Mask &= sycl::detail::GetMask(G); + return Mask; +} +#endif // __SYCL_DEVICE_ONLY__ +} // namespace detail + +namespace ext::oneapi { + template std::enable_if_t, sub_group> || std::is_same_v, sycl::sub_group>, sub_group_mask> -group_ballot(Group g, bool predicate) { - (void)g; +group_ballot([[maybe_unused]] Group g, [[maybe_unused]] bool predicate) { #ifdef __SYCL_DEVICE_ONLY__ - auto res = __spirv_GroupNonUniformBallot( - sycl::detail::spirv::group_scope::value, predicate); - sub_group_mask::BitsType val = res[0]; - if constexpr (sizeof(sub_group_mask::BitsType) == 8) - val |= ((sub_group_mask::BitsType)res[1]) << 32; - return sycl::detail::Builder::createSubGroupMask( - val, g.get_max_local_range()[0]); + return sycl::detail::commonGroupBallotImpl(g, predicate); #else - (void)predicate; throw exception{errc::feature_not_supported, "Sub-group mask is not supported on host device"}; #endif diff --git a/sycl/test-e2e/NonUniformGroups/chunk.cpp b/sycl/test-e2e/NonUniformGroups/chunk.cpp index c760f3ab0157c..b2b27495385a6 100644 --- a/sycl/test-e2e/NonUniformGroups/chunk.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk.cpp @@ -2,9 +2,8 @@ // RUN: %{run} %t.out // // CPU AOT targets host isa, so we compile on the run system instead. -// REQUIRES: opencl-aot // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} -// RUN: %if cpu %{ %{run} %t.x86.out %} +// RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} // // REQUIRES: cpu || gpu // REQUIRES: sg-32 @@ -15,7 +14,8 @@ namespace syclex = sycl::ext::oneapi::experimental; -template class TestKernel; +template class SubgroupTestKernel; +template class ChunkTestKernel; template void test() { sycl::queue Q; @@ -26,13 +26,13 @@ template void test() { if (WGS < PartitionSize) continue; - std::cout << "Testing for work size " << WGS << " and partition size " - << PartitionSize << std::endl; - sycl::buffer MatchBuf{sycl::range{WGS}}; sycl::buffer LeaderBuf{sycl::range{WGS}}; const auto NDR = sycl::nd_range<1>{WGS, WGS}; + + std::cout << "Testing for work size " << WGS << " and partition size " + << PartitionSize << std::endl; Q.submit([&](sycl::handler &CGH) { sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; @@ -52,14 +52,104 @@ template void test() { MatchAcc[WI] = Match; LeaderAcc[WI] = ChunkGroup.leader(); }; - CGH.parallel_for>(NDR, KernelFunc); + CGH.parallel_for>(NDR, KernelFunc); }); - sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; - sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; - for (int WI = 0; WI < WGS; ++WI) { - assert(MatchAcc[WI] == true); - assert(LeaderAcc[WI] == ((WI % PartitionSize) == 0)); + { + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_write}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_write}; + for (int WI = 0; WI < WGS; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == ((WI % PartitionSize) == 0)); + MatchAcc[WI] = false; + LeaderAcc[WI] = false; + } + } + + std::cout << "Testing for work size " << WGS << " and partition size " + << PartitionSize << " and subpartition size " << PartitionSize + << std::endl; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + auto ParentChunkGroup = + syclex::chunked_partition(SG); + auto ChunkGroup = + syclex::chunked_partition(ParentChunkGroup); + + bool Match = true; + Match &= (ChunkGroup.get_group_id() == 0); + Match &= (ChunkGroup.get_local_id() == (WI % PartitionSize)); + Match &= (ChunkGroup.get_group_range() == 1); + Match &= (ChunkGroup.get_local_range() == PartitionSize); + MatchAcc[WI] = Match; + LeaderAcc[WI] = ChunkGroup.leader(); + }; + CGH.parallel_for>( + NDR, KernelFunc); + }); + + { + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_write}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_write}; + for (int WI = 0; WI < WGS; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == ((WI % PartitionSize) == 0)); + MatchAcc[WI] = false; + LeaderAcc[WI] = false; + } + } + + constexpr size_t HalfPartitionSize = PartitionSize / 2; + if constexpr (HalfPartitionSize != 0) { + std::cout << "Testing for work size " << WGS << " and partition size " + << PartitionSize << " and subpartition size " + << HalfPartitionSize << std::endl; + const auto NDR = sycl::nd_range<1>{WGS, WGS}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + auto ParentChunkGroup = + syclex::chunked_partition(SG); + auto ChunkGroup = syclex::chunked_partition( + ParentChunkGroup); + + bool Match = true; + Match &= (ChunkGroup.get_group_id() == + (ParentChunkGroup.get_local_id() / HalfPartitionSize)); + Match &= (ChunkGroup.get_local_id() == + (ParentChunkGroup.get_local_id() % HalfPartitionSize)); + Match &= (ChunkGroup.get_group_range() == + (ParentChunkGroup.get_local_linear_range() / + HalfPartitionSize)); + Match &= (ChunkGroup.get_local_range() == HalfPartitionSize); + MatchAcc[WI] = Match; + LeaderAcc[WI] = ChunkGroup.leader(); + }; + CGH.parallel_for>( + NDR, KernelFunc); + }); + + { + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_write}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_write}; + for (int WI = 0; WI < WGS; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == ((WI % HalfPartitionSize) == 0)); + MatchAcc[WI] = false; + LeaderAcc[WI] = false; + } + } } } } diff --git a/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp index cc24fab7be4bd..6308ddbb11121 100644 --- a/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp @@ -1,10 +1,9 @@ // RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out // RUN: %{run} %t.out // -// CPU AOT targets host isa, so we compile on the run system instead. -// REQUIRES: opencl-aot +// CPU AOT targets host isa, so we compile on the run system. // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -fsycl-device-code-split=per_kernel -o %t.x86.out %s %} -// RUN: %if cpu %{ %{run} %t.x86.out %} +// RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} // // REQUIRES: cpu || gpu // REQUIRES: sg-32 diff --git a/sycl/test-e2e/NonUniformGroups/fragment.cpp b/sycl/test-e2e/NonUniformGroups/fragment.cpp index 47e2a3f88d7be..d31a7f860fc5d 100644 --- a/sycl/test-e2e/NonUniformGroups/fragment.cpp +++ b/sycl/test-e2e/NonUniformGroups/fragment.cpp @@ -2,19 +2,21 @@ // RUN: %{run} %t.out // // CPU AOT targets host isa, so we compile on the run system instead. -// REQUIRES: opencl-aot // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} -// RUN: %if cpu %{ %{run} %t.x86.out %} +// RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} // // REQUIRES: cpu || gpu // REQUIRES: aspect-ext_oneapi_fragment #include +#include #include #include namespace syclex = sycl::ext::oneapi::experimental; -class TestKernel; +class SubgroupTestKernel; +class FragmentTestKernel; +class ChunkTestKernel; int main() { sycl::queue Q; @@ -29,7 +31,8 @@ int main() { // Test for both the full sub-group size and a case with less work than a full // sub-group. for (size_t WGS : std::array{32, 16}) { - std::cout << "Testing for work size " << WGS << std::endl; + std::cout << "Testing sub_group partition for work size " << WGS + << std::endl; sycl::buffer MatchBuf{sycl::range{WGS}}; sycl::buffer LeaderBuf{sycl::range{WGS}}; @@ -61,7 +64,7 @@ int main() { MatchAcc[WI] = Match; LeaderAcc[WI] = FragmentGroup.leader(); }; - CGH.parallel_for(NDR, KernelFunc); + CGH.parallel_for(NDR, KernelFunc); }); sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; @@ -72,5 +75,103 @@ int main() { } } + // Test for fragment created from another fragment. + { + std::cout << "Testing for fragment from fragment" << std::endl; + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Split into odd and even work-items. + bool ParentPredicate = WI % 2 == 0; + auto ParentFragmentGroup = + syclex::binary_partition(SG, ParentPredicate); + + // Split parent fragment into odd and even participants. + bool Predicate = ParentFragmentGroup.get_local_linear_id() % 2 == 0; + auto FragmentGroup = + syclex::binary_partition(ParentFragmentGroup, Predicate); + + // Check function return values match Predicate and ParentPredicate. + bool Match = true; + auto GroupID = Predicate ? 1 : 0; + auto LocalID = ParentFragmentGroup.get_local_id() / 2; + Match &= (FragmentGroup.get_group_id() == GroupID); + Match &= (FragmentGroup.get_local_id() == LocalID); + Match &= (FragmentGroup.get_group_range() == 2); + Match &= (FragmentGroup.get_local_range() == + ParentFragmentGroup.get_local_linear_range() / 2); + MatchAcc[WI] = Match; + LeaderAcc[WI] = FragmentGroup.leader(); + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == (WI < 4)); + } + } + + // Test for fragment created from a chunk. + if (Q.get_device().has(sycl::aspect::ext_oneapi_chunk)) { + std::cout << "Testing for fragment from chunk" << std::endl; + + constexpr size_t ChunkSize = 8; + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Split into chunks. + auto ParentChunkGroup = syclex::chunked_partition(SG); + + // Split parent fragment into odd and even participants. + bool Predicate = ParentChunkGroup.get_local_linear_id() % 2 == 0; + auto FragmentGroup = + syclex::binary_partition(ParentChunkGroup, Predicate); + + // Check function return values match Predicate and ParentPredicate. + bool Match = true; + auto GroupID = Predicate ? 1 : 0; + auto LocalID = ParentChunkGroup.get_local_id() / 2; + Match &= (FragmentGroup.get_group_id() == GroupID); + Match &= (FragmentGroup.get_local_id() == LocalID); + Match &= (FragmentGroup.get_group_range() == 2); + Match &= (FragmentGroup.get_local_range() == + ParentChunkGroup.get_local_linear_range() / 2); + MatchAcc[WI] = Match; + LeaderAcc[WI] = FragmentGroup.leader(); + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == (WI % ChunkSize < 2)); + } + } + return 0; } diff --git a/sycl/test-e2e/NonUniformGroups/fragment_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/fragment_algorithms.cpp index f3cbb03d7acae..be3afb2ad7989 100644 --- a/sycl/test-e2e/NonUniformGroups/fragment_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/fragment_algorithms.cpp @@ -2,9 +2,8 @@ // RUN: %{run} %t.out // // CPU AOT targets host isa, so we compile on the run system instead. -// REQUIRES: opencl-aot // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} -// RUN: %if cpu %{ %{run} %t.x86.out %} +// RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} // // REQUIRES: cpu || gpu // REQUIRES: sg-32 diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic.cpp index 730148275212c..c122f845d8230 100644 --- a/sycl/test-e2e/NonUniformGroups/opportunistic.cpp +++ b/sycl/test-e2e/NonUniformGroups/opportunistic.cpp @@ -1,10 +1,9 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out // -// CPU AOT targets host isa - compile on run system instead -// REQUIRES: opencl-aot +// CPU AOT targets host isa - compile on run system // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} -// RUN: %if cpu %{ %{run} %t.x86.out %} +// RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} // // REQUIRES: cpu || gpu // REQUIRES: aspect-ext_oneapi_fragment diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_algorithms.cpp index 5cc65cde17797..6c8f53275b994 100644 --- a/sycl/test-e2e/NonUniformGroups/opportunistic_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/opportunistic_algorithms.cpp @@ -2,9 +2,8 @@ // RUN: %{run} %t.out // // CPU AOT targets host isa, so we compile on the run system instead. -// REQUIRES: opencl-aot // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} -// RUN: %if cpu %{ %{run} %t.x86.out %} +// RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} // // REQUIRES: cpu || gpu // REQUIRES: sg-32 diff --git a/sycl/test-e2e/NonUniformGroups/tangle.cpp b/sycl/test-e2e/NonUniformGroups/tangle.cpp index cd7598685ed1d..d6afbe1fbcbf5 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle.cpp @@ -2,9 +2,8 @@ // RUN: %{run} %t.out // // CPU AOT targets host isa, so we compile on the run system instead. -// REQUIRES: opencl-aot // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -fno-sycl-early-optimizations -o %t.x86.out %s %} -// RUN: %if cpu %{ %{run} %t.x86.out %} +// RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} // // REQUIRES: cpu || gpu // REQUIRES: aspect-ext_oneapi_tangle diff --git a/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp index 02decafb0734a..71509706c089f 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp @@ -2,9 +2,8 @@ // RUN: %{run} %t.out // // CPU AOT targets host isa, so we compile on the run system instead. -// REQUIRES: opencl-aot // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -fno-sycl-early-optimizations -o %t.x86.out %s %} -// RUN: %if cpu %{ %{run} %t.x86.out %} +// RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} // // REQUIRES: sg-32 // REQUIRES: aspect-ext_oneapi_tangle From 843df189e7e4ed8d54149889906061c3bfc27d60 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Fri, 25 Jul 2025 04:09:47 -0700 Subject: [PATCH 26/32] Address small comments Signed-off-by: Larsen, Steffen --- .../sycl/ext/oneapi/experimental/chunk.hpp | 51 +++++++++---------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp index 96e92b89f05db..88aee542bb17e 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp @@ -47,7 +47,28 @@ template class chunk { static constexpr int dimensions = 1; static constexpr sycl::memory_scope fence_scope = ParentGroup::fence_scope; - inline operator fragment() const; + inline operator fragment() const { +#ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + // make fragment from chunk's mask and properties + return fragment(Mask, get_group_id(), get_group_range()); +#else + // or mask based on chunk membership for non-NVPTX devices + uint32_t loc_id = __spirv_SubgroupLocalInvocationId(); + uint32_t chunk_start = (loc_id / ChunkSize) * ChunkSize; + sub_group_mask::BitsType bits = + ChunkSize == 32 + ? sub_group_mask::BitsType(~0) + : ((sub_group_mask::BitsType(1) << ChunkSize) - 1) << chunk_start; + sub_group_mask mask = + sycl::detail::Builder::createSubGroupMask( + bits, __spirv_SubgroupSize()); + return fragment(mask, get_group_id(), get_group_range()); +#endif +#else + return fragment(); +#endif + } id_type get_group_id() const { #ifdef __SYCL_DEVICE_ONLY__ @@ -202,32 +223,6 @@ struct is_user_constructed_group> template struct is_chunk> : std::true_type {}; -// chunk->fragment conversion -// must be defined after fragment class is available -template -inline chunk::operator fragment() const { -#ifdef __SYCL_DEVICE_ONLY__ -#if defined(__NVPTX__) - // make fragment from chunk's mask and properties - return fragment(Mask, get_group_id(), get_group_range()); -#else - // or mask based on chunk membership for non-NVPTX devices - uint32_t loc_id = __spirv_SubgroupLocalInvocationId(); - uint32_t chunk_start = (loc_id / ChunkSize) * ChunkSize; - sub_group_mask::BitsType bits = - ChunkSize == 32 - ? sub_group_mask::BitsType(~0) - : ((sub_group_mask::BitsType(1) << ChunkSize) - 1) << chunk_start; - sub_group_mask mask = - sycl::detail::Builder::createSubGroupMask( - bits, __spirv_SubgroupSize()); - return fragment(mask, get_group_id(), get_group_range()); -#endif -#else - return fragment(); -#endif -} - } // namespace ext::oneapi::experimental template @@ -235,4 +230,4 @@ struct is_group> : std::true_type {}; } // namespace _V1 -} // namespace sycl \ No newline at end of file +} // namespace sycl From ede8a483f79aaa8e0cec14d7bc3360780ff4e636 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Fri, 25 Jul 2025 04:45:14 -0700 Subject: [PATCH 27/32] Fix build failure and attempt to fix new test Signed-off-by: Larsen, Steffen --- sycl/include/sycl/ext/oneapi/experimental/chunk.hpp | 4 ++-- sycl/include/sycl/ext/oneapi/sub_group_mask.hpp | 3 ++- sycl/test-e2e/NonUniformGroups/fragment.cpp | 5 ++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp index 88aee542bb17e..9cee4f537f45c 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp @@ -165,8 +165,8 @@ template class chunk { ext::oneapi::sub_group_mask getMask() const { ext::oneapi::sub_group_mask::BitsType MaskBits{0}; MaskBits = ~MaskBits; - MaskBits >>= ext::oneapi::sub_group_mask::max_bits - ChunkSize; - MaskBits <<= + MaskBits <<= ext::oneapi::sub_group_mask::max_bits - ChunkSize; + MaskBits >>= ext::oneapi::sub_group_mask::max_bits - (((__spirv_SubgroupLocalInvocationId() / ChunkSize) + 1) * ChunkSize); return sycl::detail::Builder::createSubGroupMask< diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index 2c5c0174bcdd1..edd7287e9c06b 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -340,7 +340,8 @@ ext::oneapi::sub_group_mask GetMask(NonUniformGroup Group) { } template <> -ext::oneapi::sub_group_mask GetMask(sycl::sub_group Group) { +inline ext::oneapi::sub_group_mask +GetMask(sycl::sub_group Group) { return (~ext::oneapi::sub_group_mask::BitsType{0}) >> (ext::oneapi::sub_group_mask::max_bits - Group.get_local_linear_range()); diff --git a/sycl/test-e2e/NonUniformGroups/fragment.cpp b/sycl/test-e2e/NonUniformGroups/fragment.cpp index d31a7f860fc5d..a26ab28e1fbba 100644 --- a/sycl/test-e2e/NonUniformGroups/fragment.cpp +++ b/sycl/test-e2e/NonUniformGroups/fragment.cpp @@ -150,15 +150,14 @@ int main() { auto FragmentGroup = syclex::binary_partition(ParentChunkGroup, Predicate); - // Check function return values match Predicate and ParentPredicate. + // Check function return values match Predicate. bool Match = true; auto GroupID = Predicate ? 1 : 0; auto LocalID = ParentChunkGroup.get_local_id() / 2; Match &= (FragmentGroup.get_group_id() == GroupID); Match &= (FragmentGroup.get_local_id() == LocalID); Match &= (FragmentGroup.get_group_range() == 2); - Match &= (FragmentGroup.get_local_range() == - ParentChunkGroup.get_local_linear_range() / 2); + Match &= (FragmentGroup.get_local_range() == ChunkSize / 2); MatchAcc[WI] = Match; LeaderAcc[WI] = FragmentGroup.leader(); }; From e91634626c6a11f2a8280fc339ee2fe6f414f813 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Mon, 28 Jul 2025 05:13:53 -0700 Subject: [PATCH 28/32] Clean up is_* traits Signed-off-by: Larsen, Steffen --- sycl/include/sycl/detail/spirv.hpp | 39 ++++--------------- sycl/include/sycl/detail/type_traits.hpp | 23 +++++++++++ .../sycl/ext/oneapi/experimental/chunk.hpp | 11 ++---- .../cuda/non_uniform_algorithms.hpp | 18 ++++----- .../sycl/ext/oneapi/experimental/fragment.hpp | 11 ++---- .../experimental/non_uniform_groups.hpp | 24 ------------ .../sycl/ext/oneapi/experimental/tangle.hpp | 3 -- sycl/test/non-uniform-groups/type_traits.cpp | 31 --------------- 8 files changed, 47 insertions(+), 113 deletions(-) delete mode 100644 sycl/test/non-uniform-groups/type_traits.cpp diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index ddbc6ab3ed8b6..e12d7785d9739 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -37,9 +37,6 @@ namespace experimental { template class fragment; template class root_group; -template class tangle; -template class fragment; -template class chunk; } // namespace experimental } // namespace oneapi } // namespace ext @@ -67,26 +64,6 @@ inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id); namespace spirv { -template struct is_tangle_group : std::false_type {}; - -template -struct is_tangle_group> - : std::true_type {}; - -template struct is_ballot_group : std::false_type {}; - -template struct is_fragment : std::false_type {}; - -template -struct is_fragment> - : std::true_type {}; - -template struct is_chunk : std::false_type {}; - -template -struct is_chunk> - : std::true_type {}; - template struct group_scope {}; template @@ -902,10 +879,10 @@ inline uint32_t membermask() { template inline uint32_t MapShuffleID(GroupT g, id<1> local_id) { - if constexpr (is_tangle_group::value || - is_ballot_group::value || is_fragment::value) + if constexpr (sycl::detail::is_tangle_v || + sycl::detail::is_fragment_v) return detail::IdToMaskPosition(g, local_id); - else if constexpr (is_chunk::value) + else if constexpr (sycl::detail::is_chunk_v) return g.get_group_linear_id() * g.get_local_range().size() + local_id; else return local_id.get(0); @@ -1001,7 +978,7 @@ EnableIfNativeShuffle ShuffleXor(GroupT g, T x, id<1> mask) { if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< GroupT>) { auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0]; - if constexpr (is_chunk::value) { + if constexpr (sycl::detail::is_chunk_v) { return cuda_shfl_sync_bfly_i32(MemberMask, x, static_cast(mask.get(0)), 0x1f); } else { @@ -1048,7 +1025,7 @@ EnableIfNativeShuffle ShuffleDown(GroupT g, T x, uint32_t delta) { if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< GroupT>) { auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0]; - if constexpr (is_chunk::value) { + if constexpr (sycl::detail::is_chunk_v) { return cuda_shfl_sync_down_i32(MemberMask, x, delta, 31); } else { unsigned localSetBit = g.get_local_id()[0] + 1; @@ -1092,7 +1069,7 @@ EnableIfNativeShuffle ShuffleUp(GroupT g, T x, uint32_t delta) { if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< GroupT>) { auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0]; - if constexpr (is_chunk::value) { + if constexpr (sycl::detail::is_chunk_v) { return cuda_shfl_sync_up_i32(MemberMask, x, delta, 0); } else { unsigned localSetBit = g.get_local_id()[0] + 1; @@ -1357,8 +1334,8 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { #define __SYCL_GROUP_COLLECTIVE_TANGLE(Instruction, GroupExt) \ template <__spv::GroupOperation Op, typename Group, typename T> \ - inline typename std::enable_if_t::value, T> \ - Group##Instruction(Group, T x) { \ + inline typename std::enable_if_t, T> \ + Group##Instruction(Group, T x) { \ using ConvertedT = detail::ConvertToOpenCLType_t; \ \ using OCLT = std::conditional_t< \ diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index edade60d190cf..69c30260f21de 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -51,6 +51,10 @@ template inline constexpr bool is_user_constructed_group_v = is_user_constructed_group::value; +template class tangle; +template class fragment; +template class chunk; + namespace detail { template struct is_group_helper : std::false_type {}; @@ -79,6 +83,25 @@ struct is_generic_group template inline constexpr bool is_generic_group_v = is_generic_group::value; +template struct is_tangle : std::false_type {}; +template +struct is_tangle> + : std::true_type {}; +template constexpr bool is_tangle_v = is_tangle::value; + +template struct is_fragment : std::false_type {}; +template +struct is_fragment> + : std::true_type {}; +template +constexpr bool is_fragment_v = is_fragment::value; + +template struct is_chunk : std::false_type {}; +template +struct is_chunk> + : std::true_type {}; +template constexpr bool is_chunk_v = is_chunk::value; + namespace half_impl { class half; } diff --git a/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp index 9cee4f537f45c..56ed37c9852fb 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp @@ -35,7 +35,7 @@ template [[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_chunk)]] #endif inline std::enable_if_t || - is_chunk_v, + sycl::detail::is_chunk_v, chunk> chunked_partition(ParentGroup parent); @@ -72,7 +72,7 @@ template class chunk { id_type get_group_id() const { #ifdef __SYCL_DEVICE_ONLY__ - if constexpr (is_chunk_v) + if constexpr (sycl::detail::is_chunk_v) return __spirv_SubgroupLocalInvocationId() % ParentGroup::chunk_size / ChunkSize; else @@ -92,7 +92,7 @@ template class chunk { range_type get_group_range() const { #ifdef __SYCL_DEVICE_ONLY__ - if constexpr (is_chunk_v) + if constexpr (sycl::detail::is_chunk_v) return ParentGroup::chunk_size / ChunkSize; else return __spirv_SubgroupSize() / ChunkSize; @@ -188,7 +188,7 @@ template class chunk { // Chunked partition implementation template inline std::enable_if_t || - is_chunk_v, + sycl::detail::is_chunk_v, chunk> chunked_partition([[maybe_unused]] ParentGroup parent) { static_assert((ChunkSize & (ChunkSize - size_t{1})) == size_t{0}, @@ -220,9 +220,6 @@ template struct is_user_constructed_group> : std::true_type {}; -template -struct is_chunk> : std::true_type {}; - } // namespace ext::oneapi::experimental template diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index dca4623ed2bb4..1c090def4b8d5 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -98,10 +98,9 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, // chunk group reduction using shfls template -inline __SYCL_ALWAYS_INLINE - std::enable_if_t, T> - masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) { +inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> +masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { T tmp = cuda_shfl_sync_bfly_i32(MemberMask, x, i, 0x1f); x = binary_op(x, tmp); @@ -113,7 +112,7 @@ inline __SYCL_ALWAYS_INLINE template inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v && - !ext::oneapi::experimental::is_chunk_v, + !sycl::detail::is_chunk_v, T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -213,10 +212,9 @@ inline __SYCL_ALWAYS_INLINE // chunk group scan using shfls template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> -inline __SYCL_ALWAYS_INLINE - std::enable_if_t, T> - masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) { +inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> +masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { unsigned localIdVal = g.get_local_id()[0]; for (int i = 1; i < g.get_local_range()[0]; i *= 2) { T tmp = cuda_shfl_sync_up_i32(MemberMask, x, i, 0); @@ -236,7 +234,7 @@ template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v && - !ext::oneapi::experimental::is_chunk_v, + !sycl::detail::is_chunk_v, T> masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { diff --git a/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp b/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp index 19de9086f74f5..7136f48183dc0 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp @@ -39,8 +39,8 @@ template [[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_fragment)]] #endif inline std::enable_if_t || - is_chunk_v || - is_fragment_v, + sycl::detail::is_chunk_v || + sycl::detail::is_fragment_v, fragment> binary_partition(ParentGroup parent, bool predicate); @@ -153,8 +153,8 @@ template class fragment { template inline std::enable_if_t || - is_chunk_v || - is_fragment_v, + sycl::detail::is_chunk_v || + sycl::detail::is_fragment_v, fragment> binary_partition([[maybe_unused]] ParentGroup parent, [[maybe_unused]] bool predicate) { @@ -204,9 +204,6 @@ inline fragment get_opportunistic_group() { template struct is_user_constructed_group> : std::true_type {}; -template -struct is_fragment> : std::true_type {}; - } // namespace ext::oneapi::experimental template diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index e9fb839a77644..81ceb044eeb73 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -64,29 +64,5 @@ inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) { } } // namespace detail - -namespace ext::oneapi::experimental { - -// Forward declarations of non-uniform group types for algorithm definitions -template class fragment; -template class chunk; -template class tangle; - -// Type trait helpers -template struct is_chunk : std::false_type {}; - -template inline constexpr bool is_chunk_v = is_chunk::value; - -template struct is_fragment : std::false_type {}; - -template -inline constexpr bool is_fragment_v = is_fragment::value; - -template struct is_tangle : std::false_type {}; - -template inline constexpr bool is_tangle_v = is_tangle::value; - -} // namespace ext::oneapi::experimental - } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp index 7856391004a14..7da9a5c30daad 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp @@ -160,9 +160,6 @@ entangle(ParentGroup parent) { template struct is_user_constructed_group> : std::true_type {}; -template -struct is_tangle> : std::true_type {}; - } // namespace ext::oneapi::experimental template diff --git a/sycl/test/non-uniform-groups/type_traits.cpp b/sycl/test/non-uniform-groups/type_traits.cpp deleted file mode 100644 index 46c901d072432..0000000000000 --- a/sycl/test/non-uniform-groups/type_traits.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// RUN: %clangxx -fsycl -fsyntax-only %s - -#include -#include -#include - -namespace syclex = sycl::ext::oneapi::experimental; - -// check each trait correctly identifies own type -static_assert(syclex::is_chunk_v>); -static_assert(syclex::is_fragment_v>); -static_assert(syclex::is_tangle_v>); - -// check traits return false for different group types (cross-check) -static_assert(!syclex::is_chunk_v>); -static_assert(!syclex::is_chunk_v>); -static_assert(!syclex::is_fragment_v>); -static_assert(!syclex::is_fragment_v>); -static_assert(!syclex::is_tangle_v>); -static_assert(!syclex::is_tangle_v>); - -// check traits return false for base group types -static_assert(!syclex::is_chunk_v); -static_assert(!syclex::is_fragment_v); -static_assert(!syclex::is_tangle_v); - -// chunk sizes -static_assert(syclex::is_chunk_v>); -static_assert(syclex::is_chunk_v>); - -// these traits are used in spirv.hpp MapShuffleID() for dispatch From e3e556a3da9fbe2a02a8087df32725db8e543477 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Mon, 28 Jul 2025 05:18:23 -0700 Subject: [PATCH 29/32] Remove last (void) cast Signed-off-by: Larsen, Steffen --- sycl/include/sycl/ext/oneapi/experimental/tangle.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp index 7da9a5c30daad..e57cc6f82c210 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp @@ -134,8 +134,7 @@ template class tangle { template inline std::enable_if_t, tangle> -entangle(ParentGroup parent) { - (void)parent; +entangle([[maybe_unused]] ParentGroup parent) { #ifdef __SYCL_DEVICE_ONLY__ // sync all work-items in parent group here sycl::group_barrier(parent); From 3889570179e265aaabceeb9a4485e3ea3f758816 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Mon, 28 Jul 2025 05:23:22 -0700 Subject: [PATCH 30/32] Adhere to weird formatter requirement Signed-off-by: Larsen, Steffen --- sycl/include/sycl/detail/spirv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index e12d7785d9739..b11908fce90f0 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -1335,7 +1335,7 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { #define __SYCL_GROUP_COLLECTIVE_TANGLE(Instruction, GroupExt) \ template <__spv::GroupOperation Op, typename Group, typename T> \ inline typename std::enable_if_t, T> \ - Group##Instruction(Group, T x) { \ + Group##Instruction(Group, T x) { \ using ConvertedT = detail::ConvertToOpenCLType_t; \ \ using OCLT = std::conditional_t< \ From 6b9ed7487ff7bc55508edade91f67d40d5f77974 Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Tue, 29 Jul 2025 16:57:47 +0200 Subject: [PATCH 31/32] Update sycl/include/sycl/ext/oneapi/experimental/fragment.hpp Co-authored-by: Udit Kumar Agarwal --- sycl/include/sycl/ext/oneapi/experimental/fragment.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp b/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp index 7136f48183dc0..c1d5af9cde51d 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fragment.hpp @@ -211,4 +211,4 @@ struct is_group> : std::true_type {}; } // namespace _V1 -} // namespace sycl \ No newline at end of file +} // namespace sycl From 4d18c85f32760b36bc55c6b22a8d52bd7b7b83b2 Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Tue, 29 Jul 2025 17:23:15 +0200 Subject: [PATCH 32/32] Apply suggestions from code review --- sycl/test-e2e/NonUniformGroups/chunk.cpp | 2 +- sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp | 2 +- sycl/test-e2e/NonUniformGroups/fragment.cpp | 2 +- sycl/test-e2e/NonUniformGroups/fragment_algorithms.cpp | 2 +- sycl/test-e2e/NonUniformGroups/opportunistic.cpp | 2 +- sycl/test-e2e/NonUniformGroups/opportunistic_algorithms.cpp | 2 +- sycl/test-e2e/NonUniformGroups/tangle.cpp | 2 +- sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sycl/test-e2e/NonUniformGroups/chunk.cpp b/sycl/test-e2e/NonUniformGroups/chunk.cpp index b2b27495385a6..7b2e875913a1a 100644 --- a/sycl/test-e2e/NonUniformGroups/chunk.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out // -// CPU AOT targets host isa, so we compile on the run system instead. +// Test CPU AOT as well when possible. // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} // RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} // diff --git a/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp index 6308ddbb11121..3c29b07c18d48 100644 --- a/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out // RUN: %{run} %t.out // -// CPU AOT targets host isa, so we compile on the run system. +// Test CPU AOT as well when possible. // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -fsycl-device-code-split=per_kernel -o %t.x86.out %s %} // RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} // diff --git a/sycl/test-e2e/NonUniformGroups/fragment.cpp b/sycl/test-e2e/NonUniformGroups/fragment.cpp index a26ab28e1fbba..62a0bb2a5b1dc 100644 --- a/sycl/test-e2e/NonUniformGroups/fragment.cpp +++ b/sycl/test-e2e/NonUniformGroups/fragment.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out // -// CPU AOT targets host isa, so we compile on the run system instead. +// Test CPU AOT as well when possible. // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} // RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} // diff --git a/sycl/test-e2e/NonUniformGroups/fragment_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/fragment_algorithms.cpp index be3afb2ad7989..4c7093127c02a 100644 --- a/sycl/test-e2e/NonUniformGroups/fragment_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/fragment_algorithms.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out // -// CPU AOT targets host isa, so we compile on the run system instead. +// Test CPU AOT as well when possible. // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} // RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} // diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic.cpp index c122f845d8230..9db60ed7f6a8b 100644 --- a/sycl/test-e2e/NonUniformGroups/opportunistic.cpp +++ b/sycl/test-e2e/NonUniformGroups/opportunistic.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out // -// CPU AOT targets host isa - compile on run system +// Test CPU AOT as well when possible. // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} // RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} // diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_algorithms.cpp index 6c8f53275b994..7ba4f68c9db93 100644 --- a/sycl/test-e2e/NonUniformGroups/opportunistic_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/opportunistic_algorithms.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out // -// CPU AOT targets host isa, so we compile on the run system instead. +// Test CPU AOT as well when possible. // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} // RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} // diff --git a/sycl/test-e2e/NonUniformGroups/tangle.cpp b/sycl/test-e2e/NonUniformGroups/tangle.cpp index d6afbe1fbcbf5..2bface460eec7 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -fno-sycl-early-optimizations -o %t.out // RUN: %{run} %t.out // -// CPU AOT targets host isa, so we compile on the run system instead. +// Test CPU AOT as well when possible. // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -fno-sycl-early-optimizations -o %t.x86.out %s %} // RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} // diff --git a/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp index 71509706c089f..e9b617893f72c 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -fno-sycl-early-optimizations -o %t.out // RUN: %{run} %t.out // -// CPU AOT targets host isa, so we compile on the run system instead. +// Test CPU AOT as well when possible. // RUN: %if any-device-is-cpu && opencl-aot %{ %{run-aux} %clangxx -fsycl -fsycl-targets=spir64_x86_64 -fno-sycl-early-optimizations -o %t.x86.out %s %} // RUN: %if cpu && opencl-aot %{ %{run} %t.x86.out %} //