diff --git a/unified-runtime/scripts/core/LEVEL_ZERO.rst b/unified-runtime/scripts/core/LEVEL_ZERO.rst index e5e33e2a5b79b..c8c57d6e7263e 100644 --- a/unified-runtime/scripts/core/LEVEL_ZERO.rst +++ b/unified-runtime/scripts/core/LEVEL_ZERO.rst @@ -146,6 +146,12 @@ Environment Variables | | The wait-event path relies on | the immediate append path only for some devices when the | | | | zeCommandQueueExecuteCommandLists() | pre-requisites are met. | | +---------------------------------------------+--------------------------------------------------------------+--------------------------------------------------------------+------------------+ +| UR_L0_VECTOR_WIDTH_SIZE | Specifies the size (in bits) of the vector width supported | Any positive integer: Indicates the maximum number of data | Device-specific | +| | by the Level Zero device. This value indicates the maximum | elements that can be processed simultaneously in a single | | +| | number of data elements that can be processed simultaneously | instruction. The value entered by the user is the desired  | | +| | in a single instruction, which is useful for optimizing | width size to config. If this width size is not supported, | | +| | data-parallel workloads and understanding device caps. | then the default "max" will be used. 
| | ++---------------------------------------------+--------------------------------------------------------------+--------------------------------------------------------------+------------------+ Contributors ------------ diff --git a/unified-runtime/source/adapters/level_zero/common.cpp b/unified-runtime/source/adapters/level_zero/common.cpp index bac059c46f170..56a12f077122e 100644 --- a/unified-runtime/source/adapters/level_zero/common.cpp +++ b/unified-runtime/source/adapters/level_zero/common.cpp @@ -303,6 +303,11 @@ ze_structure_type_t getZeStructureType<ze_device_ip_version_ext_t>() { return ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT; } template <> +ze_structure_type_t +getZeStructureType<ze_device_vector_width_properties_ext_t>() { + return ZE_STRUCTURE_TYPE_DEVICE_VECTOR_WIDTH_PROPERTIES_EXT; +} +template <> ze_structure_type_t getZeStructureType<ze_device_memory_access_properties_t>() { return ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES; } diff --git a/unified-runtime/source/adapters/level_zero/common.hpp b/unified-runtime/source/adapters/level_zero/common.hpp index 33a1072e217a9..19e22de14605d 100644 --- a/unified-runtime/source/adapters/level_zero/common.hpp +++ b/unified-runtime/source/adapters/level_zero/common.hpp @@ -78,6 +78,15 @@ const int UrL0LeaksDebug = [] { return std::atoi(UrRet); }(); +// Vector width size requested through UR_L0_VECTOR_WIDTH_SIZE; 0 when the +// variable is unset (std::atoi also yields 0 for non-numeric text). +const int UrL0VectorWidth = [] { + const char *UrRet = std::getenv("UR_L0_VECTOR_WIDTH_SIZE"); + if (!UrRet) + return 0; + return std::atoi(UrRet); +}(); + // Enable for UR L0 Adapter to Init all L0 Drivers on the system with filtering // in place for only currently used Drivers. 
const int UrL0InitAllDrivers = [] { diff --git a/unified-runtime/source/adapters/level_zero/device.cpp b/unified-runtime/source/adapters/level_zero/device.cpp index 3c415b9cf39a9..a934cdb9e661a 100644 --- a/unified-runtime/source/adapters/level_zero/device.cpp +++ b/unified-runtime/source/adapters/level_zero/device.cpp @@ -690,23 +690,36 @@ ur_result_t urDeviceGetInfo( case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: return ReturnValue( size_t{Device->ZeDeviceImageProperties->maxImageArraySlices}); - // Handle SIMD widths, matching compute-runtime OpenCL implementation: - // https://github.com/intel/compute-runtime/blob/291745cdf76d83f5dc40e7ef41d347366235ccdb/opencl/source/cl_device/cl_device_caps.cpp#L236 case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: + return ReturnValue( + Device->ZeDeviceVectorWidthPropertiesExt->native_vector_width_char); case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: - return ReturnValue(uint32_t{16}); + return ReturnValue( + Device->ZeDeviceVectorWidthPropertiesExt->preferred_vector_width_char); case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: + return ReturnValue( + Device->ZeDeviceVectorWidthPropertiesExt->native_vector_width_short); case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: - return ReturnValue(uint32_t{8}); + return ReturnValue( + Device->ZeDeviceVectorWidthPropertiesExt->preferred_vector_width_short); case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: + return ReturnValue( + Device->ZeDeviceVectorWidthPropertiesExt->native_vector_width_int); case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: - return ReturnValue(uint32_t{4}); + return ReturnValue( + Device->ZeDeviceVectorWidthPropertiesExt->preferred_vector_width_int); case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: + return ReturnValue( + Device->ZeDeviceVectorWidthPropertiesExt->native_vector_width_long); case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: - return ReturnValue(uint32_t{1}); + return ReturnValue( + Device->ZeDeviceVectorWidthPropertiesExt->preferred_vector_width_long); case 
UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: + return ReturnValue( + Device->ZeDeviceVectorWidthPropertiesExt->native_vector_width_float); case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: - return ReturnValue(uint32_t{1}); + return ReturnValue( + Device->ZeDeviceVectorWidthPropertiesExt->preferred_vector_width_float); case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: // Must return 0 for *vector_width_double* if the device does not have fp64. @@ -714,11 +727,17 @@ ur_result_t urDeviceGetInfo( return ReturnValue(uint32_t{0}); return ReturnValue(uint32_t{1}); case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: + // Must return 0 for *vector_width_half* if the device does not have fp16. + if (!(Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP16)) + return ReturnValue(uint32_t{0}); + return ReturnValue( + Device->ZeDeviceVectorWidthPropertiesExt->native_vector_width_half); case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: // Must return 0 for *vector_width_half* if the device does not have fp16. 
if (!(Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP16)) return ReturnValue(uint32_t{0}); - return ReturnValue(uint32_t{8}); + return ReturnValue( + Device->ZeDeviceVectorWidthPropertiesExt->preferred_vector_width_half); case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { // Max_num_sub_Groups = maxTotalGroupSize/min(set of subGroupSizes); uint32_t MinSubGroupSize = @@ -1857,6 +1876,69 @@ ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, }; #endif // ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME + auto UrPlatform = this->Platform; + ZeDeviceVectorWidthPropertiesExt.Compute = + [ZeDevice, UrPlatform]( + ZeStruct<ze_device_vector_width_properties_ext_t> &Properties) { + // Set default vector width properties + Properties.preferred_vector_width_char = 16u; + Properties.preferred_vector_width_short = 8u; + Properties.preferred_vector_width_int = 4u; + Properties.preferred_vector_width_long = 1u; + Properties.preferred_vector_width_float = 1u; + Properties.preferred_vector_width_half = 8u; + Properties.native_vector_width_char = 16u; + Properties.native_vector_width_short = 8u; + Properties.native_vector_width_int = 4u; + Properties.native_vector_width_long = 1u; + Properties.native_vector_width_float = 1u; + Properties.native_vector_width_half = 8u; + + if (UrPlatform->zeDriverExtensionMap.count( + ZE_DEVICE_VECTOR_SIZES_EXT_NAME)) { + uint32_t Count = 0; + ZE_CALL_NOCHECK(zeDeviceGetVectorWidthPropertiesExt, + (ZeDevice, &Count, nullptr)); + + std::vector<ZeStruct<ze_device_vector_width_properties_ext_t>> + PropertiesVector; + // resize(), not reserve(): the driver writes Count entries through + // data(), and the empty() check below must reflect them. + PropertiesVector.resize(Count); + + ZeStruct<ze_device_vector_width_properties_ext_t> + MaxVectorWidthProperties; + + ZE_CALL_NOCHECK(zeDeviceGetVectorWidthPropertiesExt, + (ZeDevice, &Count, PropertiesVector.data())); + if (!PropertiesVector.empty()) { + // Find the largest vector_width_size property + uint32_t max_vector_width_size = 0; + for (const auto &prop : PropertiesVector) { + if (!max_vector_width_size) { + max_vector_width_size = prop.vector_width_size; + MaxVectorWidthProperties = prop; + } else if (prop.vector_width_size > 
max_vector_width_size) { + max_vector_width_size = prop.vector_width_size; + MaxVectorWidthProperties = prop; + } + } + Properties = MaxVectorWidthProperties; + // If the environment variable is set, use the specified vector + // width if it exists + if (UrL0VectorWidth) { + for (const auto &prop : PropertiesVector) { + if (prop.vector_width_size == + static_cast<uint32_t>(UrL0VectorWidth)) { + Properties = prop; + break; + } + } + } + } + } + }; + ImmCommandListUsed = this->useImmediateCommandLists(); uint32_t numQueueGroups = 0; diff --git a/unified-runtime/source/adapters/level_zero/device.hpp b/unified-runtime/source/adapters/level_zero/device.hpp index 1ca19ed80cd4f..53572fbe1d6c3 100644 --- a/unified-runtime/source/adapters/level_zero/device.hpp +++ b/unified-runtime/source/adapters/level_zero/device.hpp @@ -231,6 +231,8 @@ struct ur_device_handle_t_ : ur_object { ZeCache<ZeStruct<ze_intel_device_block_array_exp_properties_t>> ZeDeviceBlockArrayProperties; #endif // ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME + ZeCache<ZeStruct<ze_device_vector_width_properties_ext_t>> + ZeDeviceVectorWidthPropertiesExt; // Map device bindless image offset to corresponding host image handle. std::unordered_map